wCMF  3.6
 All Classes Namespaces Files Functions Variables Groups Pages
class.SearchUtil.php
Go to the documentation of this file.
1 <?php
2 /**
3  * wCMF - wemove Content Management Framework
4  * Copyright (C) 2005-2014 wemove digital solutions GmbH
5  *
6  * Licensed under the terms of any of the following licenses
7  * at your choice:
8  *
9  * - GNU Lesser General Public License (LGPL)
10  * http://www.gnu.org/licenses/lgpl.html
11  * - Eclipse Public License (EPL)
12  * http://www.eclipse.org/org/documents/epl-v10.php
13  *
14  * See the license.txt file distributed with this work for
15  * additional information.
16  *
17  * $Id: class.SearchUtil.php 1462 2014-02-04 23:52:27Z iherwig $
18  */
19 set_include_path(get_include_path().PATH_SEPARATOR.BASE.'wcmf/3rdparty/zend');
20 
21 require_once BASE.'wcmf/3rdparty/zend/Zend/Search/Lucene.php';
22 require_once BASE.'wcmf/3rdparty/zend/Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php';
23 require_once BASE.'wcmf/lib/util/class.InifileParser.php';
24 require_once BASE.'wcmf/lib/util/class.StringUtil.php';
25 
26 /**
27  * @class SearchUtil
28  * @ingroup Util
29  * @brief This class provides access to the search based on Zend_Search_Lucene.
30  * The search index is stored in the location that is defined by the configuration key 'indexPath'
31  * in the configuration section 'search'. To manage PersistentObjects in the index use the
32  * methods SearchUtil::indexInSearch(), SearchUtil::deleteFromSearch() and SearchUtil::commitIndex().
33  * The method SearchUtil::getIndex() offers direct access to the search index for advanced operations.
34  *
35  * @author Niko <enikao@users.sourceforge.net>
36  */
38 {
39  const INI_SECTION = 'search';
40  const INI_INDEX_PATH = 'indexPath';
41 
42  private static $isActivated = null;
43  private static $index;
44  private static $indexPath;
45  private static $indexIsDirty = false;
46 
/**
 * Obtain the Zend_Search_Lucene index that backs the search.
 *
 * The index is (re)opened and the default analyzer and query parser settings
 * are applied again whenever no index is cached yet or $create is true.
 * If opening fails with a Zend_Search_Lucene_Exception, a new index is
 * created by SearchUtil::resetIndex().
 *
 * @param create Boolean, pass false to get an already opened index as it is [default: true]
 * @return An instance of Zend_Search_Lucene_Interface or null, if the search is not activated
 */
public static function getIndex($create = true)
{
  if (!self::isActivated()) {
    return null;
  }

  // the cached instance is only reused, if the caller explicitly passes $create = false
  if (self::$index && !$create) {
    return self::$index;
  }

  $path = self::getIndexPath();

  // UTF-8, case insensitive analyzer that ignores the stop words
  $utf8Analyzer = new Analyzer();
  $words = self::getStopWords();
  $utf8Analyzer->addFilter(new Zend_Search_Lucene_Analysis_TokenFilter_StopWords($words));

  // process wide defaults of Zend_Search_Lucene
  Zend_Search_Lucene_Analysis_Analyzer::setDefault($utf8Analyzer);
  Zend_Search_Lucene_Search_Query_Wildcard::setMinPrefixLength(0);
  Zend_Search_Lucene_Search_QueryParser::setDefaultEncoding('UTF-8');
  Zend_Search_Lucene_Search_QueryParser::setDefaultOperator(Zend_Search_Lucene_Search_QueryParser::B_AND);

  try {
    self::$index = Zend_Search_Lucene::open($path);
  }
  catch (Zend_Search_Lucene_Exception $openError) {
    // the index could not be opened, continue with a new one
    self::$index = self::resetIndex();
  }
  return self::$index;
}
85 
/**
 * Reset the search index by creating a new index in the configured index path.
 * @note The index instance cached by SearchUtil::getIndex() is not replaced by this method.
 * @return The index returned by Zend_Search_Lucene::create() or null, if the search is not activated
 */
public static function resetIndex()
{
  if (self::isActivated()) {
    return Zend_Search_Lucene::create(self::getIndexPath());
  }
  return null;
}
97 
/**
 * Add a PersistentObject instance to the search index. Documents that were
 * indexed for the same object id before are removed first, so each object
 * is represented by one document. This method modifies the index. For that
 * reason SearchUtil::commitIndex() should be called afterwards.
 *
 * Nothing happens, if the search is not activated or the object states that
 * it should not be indexed (isIndexInSearch()).
 *
 * @param obj The PersistentObject instance.
 */
public static function indexInSearch(&$obj)
{
  if (!self::isActivated() || !$obj->isIndexInSearch()) {
    return;
  }
  $index = self::getIndex();
  $oid = $obj->getOID();

  $doc = new Zend_Search_Lucene_Document();

  // The oid must be stored (it is read from the search hits) AND indexed
  // (it is used as term to find existing documents of the object). An
  // unIndexed field creates no terms, so termDocs() would never find it.
  $doc->addField(Zend_Search_Lucene_Field::keyword('oid', $oid, 'UTF-8'));

  // the type is searchable, but not returned with the hits
  $typeField = Zend_Search_Lucene_Field::keyword('type', $obj->getType(), 'UTF-8');
  $typeField->isStored = false;
  $doc->addField($typeField);

  foreach ($obj->getValueNames(DATATYPE_ATTRIBUTE) as $curValueName) {
    $properties = $obj->getValueProperties($curValueName);
    $inputType = $properties['input_type'];
    $value = self::encodeValue($obj->getValue($curValueName, DATATYPE_ATTRIBUTE), $inputType);
    if (preg_match('/^text|^f?ckeditor/', $inputType)) {
      // text like attributes are tokenized, markup is not indexed
      $doc->addField(Zend_Search_Lucene_Field::unStored($curValueName, strip_tags($value), 'UTF-8'));
    }
    else {
      // all other attributes are indexed as a whole
      $field = Zend_Search_Lucene_Field::keyword($curValueName, $value, 'UTF-8');
      $field->isStored = false;
      $doc->addField($field);
    }
  }

  // remove the documents that exist for the object already
  $term = new Zend_Search_Lucene_Index_Term($oid, 'oid');
  foreach ($index->termDocs($term) as $id) {
    $index->delete($id);
  }

  $index->addDocument($doc);
  self::$indexIsDirty = true;
}
145 
/**
 * Prepare an attribute value for indexing and summary creation.
 * HTML entities are decoded for values of (f)ckeditor inputs, surrounding
 * whitespace is removed from all values.
 * @param value The attribute value
 * @param inputType The input type of the attribute
 * @return The prepared value
 */
private static function encodeValue($value, $inputType)
{
  $isEditorContent = preg_match('/^f?ckeditor/', $inputType) === 1;
  return trim($isEditorContent ? html_entity_decode($value, ENT_QUOTES, 'UTF-8') : $value);
}
153 
/**
 * Delete a PersistentObject instance from the search index. All documents
 * that are found for the object id in the field 'oid' are deleted and the
 * index is marked as modified. Nothing happens, if the search is not
 * activated or the object states that it is not indexed (isIndexInSearch()).
 * NOTE(review): the lookup can only match documents whose 'oid' field was
 * written as an indexed field - verify against SearchUtil::indexInSearch().
 * @param obj The PersistentObject instance.
 */
public static function deleteFromSearch(&$obj)
{
  if (!self::isActivated() || !$obj->isIndexInSearch()) {
    return;
  }
  $searchIndex = self::getIndex();

  $oidTerm = new Zend_Search_Lucene_Index_Term($obj->getOID(), 'oid');
  foreach ($searchIndex->termDocs($oidTerm) as $docId) {
    $searchIndex->delete($docId);
  }
  self::$indexIsDirty = true;
}
176 
/**
 * Commit any changes made by using SearchUtil::indexInSearch() and SearchUtil::deleteFromSearch().
 * @note This method only commits the index if changes were made using the methods mentioned above.
 * The modification marker is not cleared by the commit.
 * @param optimize Boolean whether the index should be optimized after commit [default: true].
 */
public static function commitIndex($optimize = true)
{
  if (!self::isActivated() || !self::$indexIsDirty) {
    return;
  }
  // use the index instance that received the changes
  $searchIndex = self::getIndex(false);
  if (!$searchIndex) {
    return;
  }
  $searchIndex->commit();
  if ($optimize) {
    $searchIndex->optimize();
  }
}
198 
/**
 * Optimize the index. Nothing happens, if the search is not activated
 * or no index is available.
 */
public static function optimizeIndex()
{
  if (self::isActivated()) {
    $searchIndex = self::getIndex(false);
    if ($searchIndex) {
      $searchIndex->optimize();
    }
  }
}
212 
/**
 * Get the path to the index. The path is resolved once from the
 * configuration (relative to BASE.'application/') and cached afterwards.
 * The directory is created, if it does not exist. An error is logged, if
 * the directory is not writeable or the configuration value is missing.
 * @return The path (not set, if the configuration value is missing).
 */
private static function getIndexPath()
{
  if (self::$indexPath) {
    return self::$indexPath;
  }

  $parser = InifileParser::getInstance();
  $configuredPath = $parser->getValue(self::INI_INDEX_PATH, self::INI_SECTION);
  if ($configuredPath === false) {
    Log::error($parser->getErrorMsg(), __CLASS__);
    return self::$indexPath;
  }

  self::$indexPath = BASE . 'application/' . $configuredPath;

  if (!file_exists(self::$indexPath)) {
    FileUtil::mkdirRec(self::$indexPath);
  }

  if (!is_writeable(self::$indexPath)) {
    Log::error("Index path '".self::$indexPath."' is not writeable.", __CLASS__);
  }

  Log::debug("Lucene index location: ".self::$indexPath, __CLASS__);

  return self::$indexPath;
}
243 
/**
 * Check if an index path is defined in the configuration (key 'indexPath'
 * in section 'search'). The result of the first check is cached.
 * @return Boolean
 */
public static function isActivated()
{
  if (self::$isActivated !== null) {
    return self::$isActivated;
  }
  $parser = InifileParser::getInstance();
  $configuredPath = $parser->getValue(self::INI_INDEX_PATH, self::INI_SECTION);
  self::$isActivated = ($configuredPath !== false);
  return self::$isActivated;
}
256 
/**
 * Get a list of words that are forbidden to search for. The words are read
 * from $GLOBALS['STOP_WORDS'] (one word per line).
 * @return Array of words without surrounding whitespace and without empty entries
 */
public static function getStopWords()
{
  // trim each entry, because the list would contain words with a trailing
  // carriage return, if this file is stored with windows line endings
  $words = array_map('trim', explode("\n", $GLOBALS['STOP_WORDS']));
  return array_values(array_filter($words, 'strlen'));
}
265 
/**
 * Search for searchTerm in index
 * @param searchTerm The query string (parsed by Zend_Search_Lucene_Search_QueryParser)
 * @param pagingInfo Optional paging definition. If it is given and its page size is greater than 0,
 * the total count is set on it and only the hits of the requested page are returned
 * @return Associative array with object ids as keys and
 * associative array with keys 'oid', 'score', 'summary' as value. The array is
 * empty, if the search is not activated or no index is available. If an
 * exception occurs while searching, the results collected so far are returned.
 */
public static function find($searchTerm, &$pagingInfo)
{
  $results = array();
  if (!self::isActivated()) {
    return $results;
  }
  $index = self::getIndex(false);
  if (!$index) {
    return $results;
  }

  $persistenceFacade = PersistenceFacade::getInstance();
  $query = Zend_Search_Lucene_Search_QueryParser::parse($searchTerm, 'UTF-8');
  try {
    $hits = $index->find($query);
    if ($pagingInfo != null && $pagingInfo->getPageSize() > 0) {
      $pagingInfo->setTotalCount(sizeof($hits));
      $hits = array_slice($hits, $pagingInfo->getIndex(), $pagingInfo->getPageSize());
    }
    foreach ($hits as $hit) {
      $oid = $hit->oid;
      $obj = $persistenceFacade->load($oid, BUILDDEPTH_SINGLE);
      $results[$oid] = array(
        'oid' => $oid,
        'score' => $hit->score,
        // no summary, if the object could not be loaded
        'summary' => is_object($obj) ? self::createSummary($obj, $query) : ''
      );
    }
  }
  catch (Exception $ex) {
    // searching is best effort: return what was collected until the failure
  }
  return $results;
}

/**
 * Create the summary of a search hit: an excerpt around the first highlighted
 * match in the first attribute value of the object that matches the query.
 * @param obj The object that belongs to the hit
 * @param query The parsed query, that is used to highlight the matches
 * @return The excerpt with the matches wrapped in <em class="highlighted"> or an
 * empty string, if no attribute value matches
 */
private static function createSummary($obj, $query)
{
  // the markup that htmlFragmentHighlightMatches() is expected to put around matches
  $highlightedRegex = '/((<b style="color:black;background-color:#[0-9a-f]{6}">)+)([^<]+?)((<\/b>)+)/';
  foreach ($obj->getValueNames(DATATYPE_ATTRIBUTE) as $curValueName) {
    $properties = $obj->getValueProperties($curValueName);
    $value = self::encodeValue($obj->getValue($curValueName, DATATYPE_ATTRIBUTE), $properties['input_type']);
    if (strlen($value) == 0) {
      continue;
    }
    $highlighted = $query->htmlFragmentHighlightMatches(strip_tags($value), 'UTF-8');
    $matches = array();
    if (preg_match($highlightedRegex, $highlighted, $matches)) {
      // the text of the first match is the center of the excerpt
      $matchedText = $matches[3];
      $highlighted = preg_replace($highlightedRegex, ' <em class="highlighted">$3</em> ', $highlighted);
      $highlighted = trim(preg_replace('/&#13;|[\n\r\t]/', ' ', $highlighted));
      return StringUtil::excerpt($highlighted, $matchedText, 300, '');
    }
  }
  return '';
}
327 }
328 
/**
 * Analyzer for the search index. It behaves like the UTF-8, case insensitive
 * analyzer it extends, but always passes UTF-8 as input encoding.
 */
class Analyzer extends Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive
{
  /**
   * Override method to make sure we are using utf-8
   * @param data The text to analyze
   * @param encoding Ignored, the parent implementation always receives 'UTF-8'
   */
  public function setInput($data, $encoding = '')
  {
    $forcedEncoding = 'UTF-8';
    parent::setInput($data, $forcedEncoding);
  }
}
338 
/**
 * Standard german/english stop words taken from Lucene's StopAnalyzer.
 * The words are stored as one string with one word per line, see
 * SearchUtil::getStopWords().
 */
$GLOBALS['STOP_WORDS'] = implode("\n", array(
  // german
  'ein', 'einer', 'eine', 'eines', 'einem', 'einen',
  'der', 'die', 'das', 'dass', 'daß',
  'du', 'er', 'sie', 'es',
  'was', 'wer', 'wie', 'wir',
  'und', 'oder', 'ohne', 'mit',
  'am', 'im', 'in', 'aus', 'auf',
  'ist', 'sein', 'war', 'wird',
  'ihr', 'ihre', 'ihres',
  'als', 'für', 'von', 'mit',
  'dich', 'dir', 'mich', 'mir',
  'mein', 'sein', 'kein',
  'durch', 'wegen', 'wird',
  // english
  'a', 'an', 'and', 'are', 'as', 'at',
  'be', 'but', 'by',
  'for', 'if', 'in', 'into', 'is', 'it',
  'no', 'not', 'of', 'on', 'or',
  's', 'such', 't',
  'that', 'the', 'their', 'then', 'there', 'these', 'they', 'this', 'to',
  'was', 'will', 'with'
));
428 ?>
static optimizeIndex()
static commitIndex($optimize=true)
error($message, $category)
Definition: class.Log.php:69
static find($searchTerm, &$pagingInfo)
static getIndex($create=true)
static encodeValue($value, $inputType)
debug($message, $category)
Definition: class.Log.php:39
static deleteFromSearch(&$obj)
static isActivated()
const DATATYPE_ATTRIBUTE
static getStopWords()
setInput($data, $encoding= '')
mkdirRec($dirname)
static $isActivated
static getIndexPath()
$GLOBALS['STOP_WORDS']
static resetIndex()
static $indexIsDirty
static indexInSearch(&$obj)
This class provides access to the search based on Zend_Search_Lucene. The search index stored in the ...
excerpt($text, $phrase, $radius=100)
const INI_INDEX_PATH
const BUILDDEPTH_SINGLE