Source for file helper.php
Documentation is available at helper.php
* @package Joomla.Administrator
* @copyright Copyright (C) 2005 - 2013 Open Source Matters, Inc. All rights reserved.
* @license GNU General Public License version 2 or later; see LICENSE
* Helper class for the Finder indexer package.
* @package Joomla.Administrator
* The token stemmer object. The stemmer is set by whatever class
* wishes to use it but it must be an instance of FinderIndexerStemmer.
* @var FinderIndexerStemmer
* Method to parse input into plain text.
* @param string $input The raw input.
* @param string $format The format of the input. [optional]
* @return string The parsed input.
* @throws Exception on invalid parser.
public static function parse($input, $format =
'html')
// Get a parser for the specified format and parse the input.
* Method to tokenize a text string.
* @param string $input The input to tokenize.
* @param string $lang The language of the input.
* @param boolean $phrase Flag to indicate whether input could be a phrase. [optional]
* @return array An array of FinderIndexerToken objects.
public static function tokenize($input, $lang, $phrase =
false)
$store =
JString::strlen($input) <
128 ?
md5($input .
'::' .
$lang .
'::' .
$phrase) :
null;
// Check if the string has been tokenized already.
if ($store && isset
($cache[$store]))
// Get the simple language key.
$lang =
self::getPrimaryLanguage($lang);
* Parsing the string input into terms is a multi-step process.
* 1. Remove everything except letters, numbers, quotes, apostrophe, plus, dash, period, and comma.
* 2. Remove plus, dash, period, and comma characters located before letter characters.
* 3. Remove plus, dash, period, and comma characters located after letter or number characters.
* 4. Remove plus, period, and comma characters enclosed in alphabetical characters. Ungreedy.
* 5. Remove orphaned apostrophe, plus, dash, period, and comma characters.
* 6. Remove orphaned quote characters.
* 7. Replace the assorted single quotation marks with the ASCII standard single quotation.
* 8. Replace runs of multiple space characters with a single space.
$input =
preg_replace('#[^\pL\pM\pN\p{Pi}\p{Pf}\'+-.,]+#mui', ' ', $input);
$input =
preg_replace('#(^|\s)[+-.,]+([\pL\pM]+)#mui', ' $1', $input);
$input =
preg_replace('#([\pL\pM\pN]+)[+-.,]+(\s|$)#mui', '$1 ', $input);
$input =
preg_replace('#([\pL\pM]+)[+.,]+([\pL\pM]+)#muiU', '$1 $2', $input);
$input =
preg_replace('#(^|\s)[\'+-.,]+(\s|$)#mui', ' ', $input);
$input =
preg_replace('#(^|\s)[\p{Pi}\p{Pf}]+(\s|$)#mui', ' ', $input);
$input =
preg_replace('#[' .
$quotes .
']+#mui', '\'', $input);
// Explode the normalized string to get the terms.
* If we have Unicode support and are dealing with Chinese text, Chinese
* has to be handled specially because there are not necessarily any spaces
* between the "words". So, we have to test if the words belong to the Chinese
* character set and if so, explode them into single glyphs or "words".
// Iterate through the terms and test if they contain Chinese.
for ($i =
0, $n =
count($terms); $i <
$n; $i++
)
$charCount =
preg_match_all('#[\p{Han}]#mui', $terms[$i], $charMatches);
// Split apart any groups of Chinese characters.
for ($j =
0; $j <
$charCount; $j++
)
$terms[] =
$charMatches[0][$j];
* If we have to handle the input as a phrase, that means we don't
* tokenize the individual terms and we do not create the two and three
* term combinations. The phrase must contain more than one word!
if ($phrase ===
true &&
count($terms) >
1)
// Create tokens from the phrase.
// Create tokens from the terms.
for ($i =
0, $n =
count($terms); $i <
$n; $i++
)
// Create two and three word phrase tokens from the individual words.
for ($i =
0, $n =
count($tokens); $i <
$n; $i++
)
// Setup the phrase positions.
// Create the two word phrase.
if ($i2 <
$n && isset
($tokens[$i2]))
// Tokenize the two word phrase.
$token =
new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $lang, $lang ===
'zh' ?
'' :
' ');
// Add the token to the stack.
// Create the three word phrase.
if ($i3 <
$n && isset
($tokens[$i3]))
// Tokenize the three word phrase.
$token =
new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term), $lang, $lang ===
'zh' ?
'' :
' ');
// Add the token to the stack.
* Method to get the base word of a token. This method uses the public
* {@link FinderIndexerHelper::$stemmer} object if it is set. If no stemmer is set,
* the original token is returned.
* @param string $token The token to stem.
* @param string $lang The language of the token.
* @return string The root token.
public static function stem($token, $lang)
// Trim apostrophes at either end of the token.
// Trim everything after any apostrophe in the token.
// Stem the token if we have a valid stemmer to use.
if (self::$stemmer instanceof
FinderIndexerStemmer)
return self::$stemmer->stem($token, $lang);
* Method to add a content type to the database.
* @param string $title The type of content. For example: PDF
* @param string $mime The MIME type of the content. For example: application/pdf [optional]
* @return integer The id of the content type.
* @throws Exception on database error.
$db =
JFactory::getDbo();
$query =
$db->getQuery(true);
// Check if the types are loaded.
// Build the query to get the types.
->from($db->quoteName('#__finder_types'));
$types =
$db->loadObjectList('title');
// Check if the type already exists.
if (isset
($types[$title]))
return (int)
$types[$title]->id;
->insert($db->quoteName('#__finder_types'))
->columns(array($db->quoteName('title'), $db->quoteName('mime')))
->values($db->quote($title) .
', ' .
$db->quote($mime));
return (int)
$db->insertid();
* Method to check if a token is common in a language.
* @param string $token The token to test.
* @param string $lang The language to reference.
* @return boolean True if common, false otherwise.
public static function isCommon($token, $lang)
// Load the common tokens for the language if necessary.
if (!isset
($data[$lang]))
$data[$lang] =
self::getCommonWords($lang);
// Check if the token is in the common array.
* Method to get an array of common terms for a language.
* @param string $lang The language to use.
* @return array Array of common terms.
* @throws Exception on database error.
// Create the query to load all the common terms for the language.
$query =
$db->getQuery(true)
->select($db->quoteName('term'))
->from($db->quoteName('#__finder_terms_common'))
->where($db->quoteName('language') .
' = ' .
$db->quote($lang));
// Load all of the common terms for the language.
$results =
$db->loadColumn();
* Method to get the default language for the site.
* @return string The default language string.
// We need to go to com_languages to get the site default language; it's the best we can guess.
* Method to parse a language/locale key and return a simple language string.
* @param string $lang The language/locale key. For example: en-GB
* @return string The simple language string. For example: en
// Only parse the identifier if necessary.
if (!isset
($data[$lang]))
if (is_callable(array('Locale', 'getPrimaryLanguage')))
// Get the language key using the Locale package.
$data[$lang] =
Locale::getPrimaryLanguage($lang);
// Get the language key using string position.
* Method to get the path (SEF route) for a content item.
* @param string $url The non-SEF route to the content item.
* @return string The path for the content item.
// Only get the router once.
// Get and configure the site router.
$router->setMode($config->get('sef', 1));
// Build the relative route.
$uri =
$router->build($url);
$route =
$uri->toString(array('path', 'query', 'fragment'));
* Method to get extra data for a content item before it is indexed. This is how
* we add Comments, Tags, Labels, etc. that should be available to Finder.
* @param FinderIndexerResult &$item The item to index as a FinderIndexerResult object.
* @return boolean True on success, false on failure.
* @throws Exception on database error.
// Get the event dispatcher.
// Load the finder plugin group.
$results =
$dispatcher->trigger('onPrepareFinderContent', array(&$item));
// Check the returned results. This is for plugins that don't throw
// exceptions when they encounter serious errors.
throw
new Exception($dispatcher->getError(), 500);
// Handle a caught exception.
* Method to process content text using the onContentPrepare event trigger.
* @param string $text The content to process.
* @param JRegistry $params The parameters object. [optional]
* @return string The processed content.
// Load the content plugins if necessary.
// Instantiate the parameter object if necessary.
$registry->loadString($params);
// Create a mock content object.
// Fire the onContentPrepare event.
$dispatcher->trigger('onContentPrepare', array('com_finder.indexer', &$content, &$params, 0));
Documentation generated on Tue, 19 Nov 2013 15:04:20 +0100 by phpDocumentor 1.4.3