Source for file indexer.php
Documentation is available at indexer.php
* @package Joomla.Administrator
* @copyright Copyright (C) 2005 - 2013 Open Source Matters, Inc. All rights reserved.
* @license GNU General Public License version 2 or later; see LICENSE
* Main indexer class for the Finder indexer package.
* The indexer class provides the core functionality of the Finder
* search engine. It is responsible for adding and updating the
* content links table; extracting and scoring tokens; and maintaining
* all referential information for the content.
* Note: All exceptions thrown from within this class should be caught
* by the calling code.
* @package Joomla.Administrator
* The title context identifier.
* The text context identifier.
* The meta context identifier.
* The path context identifier.
* The misc context identifier.
* The indexer state object.
* The indexer profiler object.
* Returns a reference to the FinderIndexer object.
* @return FinderIndexer instance based on the database driver
* @throws RuntimeException if driver class for indexer not present.
// Set up the adapter for the indexer.
elseif ($format ==
'sqlazure')
$path = __DIR__ .
'/driver/' .
$format .
'.php';
$class =
'FinderIndexerDriver' .
ucfirst($format);
// Check if a driver exists for the format.
// Instantiate the driver.
// Throw invalid driver exception.
throw
new RuntimeException(JText::sprintf('COM_FINDER_INDEXER_INVALID_DRIVER', $format));
* Method to get the indexer state.
* @return object The indexer state object.
// First, try to load from the internal state.
if (!empty(self::$state))
// If we couldn't load from the internal state, try the session.
$session =
JFactory::getSession();
$data =
$session->get('_finder.state', null);
// If the state is empty, load the values for the first time.
// Load the default configuration options.
// Set up the weight lookup information.
self::TITLE_CONTEXT =>
round($data->options->get('title_multiplier', 1.7), 2),
self::TEXT_CONTEXT =>
round($data->options->get('text_multiplier', 0.7), 2),
self::META_CONTEXT =>
round($data->options->get('meta_multiplier', 1.2), 2),
self::PATH_CONTEXT =>
round($data->options->get('path_multiplier', 2.0), 2),
self::MISC_CONTEXT =>
round($data->options->get('misc_multiplier', 0.3), 2)
// Set the current time as the start time.
// Set the remaining default values.
$data->batchSize = (int)
$data->options->get('batch_size', 50);
$data->pluginState =
array();
// Set up the profiler if debugging is enabled.
self::$profiler =
JProfiler::getInstance('FinderIndexer');
if ($data->options->get('stem', 1) &&
$data->options->get('stemmer', 'porter_en'))
FinderIndexerHelper::$stemmer =
FinderIndexerStemmer::getInstance($data->options->get('stemmer', 'porter_en'));
* Method to set the indexer state.
* @param object $data A new indexer state object.
* @return boolean True on success, false on failure.
// Check the state object.
if (empty($data) ||
!$data instanceof
JObject)
// Set the new internal state.
// Set the new session state.
$session =
JFactory::getSession();
$session->set('_finder.state', $data);
* Method to reset the indexer state.
// Reset the internal state to null.
// Reset the session state to null.
$session->set('_finder.state', null);
* Method to index a content item.
* @param FinderIndexerResult $item The content item to index.
* @param string $format The format of the content. [optional]
* @return integer The ID of the record in the links table.
* @throws Exception on database error.
// Abstract: no implementation is provided here; a concrete subclass must supply it.
// $format is optional and defaults to 'html'.
abstract public function index($item, $format =
'html');
* Method to remove a link from the index.
* @param integer $linkId The id of the link.
* @return boolean True on success.
* @throws Exception on database error.
// Abstract: no implementation is provided here; a concrete subclass must supply the removal logic.
abstract public function remove($linkId);
* Method to optimize the index. We use this method to remove unused terms
* and any other optimizations that might be necessary.
* @return boolean True on success.
* @throws Exception on database error.
* Method to get a content item's signature.
* @param object $item The content item to index.
* @return string The content item's signature.
// Get the indexer state.
$state =
self::getState();
// Get the relevant configuration variables.
$config[] =
$state->weights;
$config[] =
$state->options->get('stem', 1);
$config[] =
$state->options->get('stemmer', 'porter_en');
* Method to parse input, tokenize it, and then add it to the database.
* @param mixed $input String or resource to use as input. A resource
* input will automatically be chunked to conserve
* memory. Strings will be chunked if longer than
* 2K in size.
* @param integer $context The context of the input. See context constants.
* @param string $lang The language of the input.
* @param string $format The format of the input.
* @return integer The number of tokens extracted from the input.
protected function tokenizeToDB($input, $context, $lang, $format)
// If the input is a resource, batch the process out.
// Batch the process out to avoid memory limits.
$buffer .=
fread($input, 2048);
* If we haven't reached the end of the file, seek to the last
* space character and drop whatever is after that to make sure
* we didn't truncate a term while reading the input.
// Find the last space character.
// Adjust string based on the last space character.
// Truncate the string to the last space character.
$string =
substr($buffer, 0, $ls);
// Adjust the buffer based on the last space for the next iteration and trim.
// No space character was found.
// We've reached the end of the file, so parse whatever remains.
// Add the tokens to the database.
// Check if we're approaching the memory limit of the token table.
if ($count >
self::$state->options->get('memory_table_limit', 30000))
// If the input is greater than 2K in size, it is more efficient to
// batch out the operation into smaller chunks of work.
elseif (strlen($input) >
2048)
* As it turns out, the complex regular expressions we use for
* sanitizing input are not very efficient when given large
* strings. It is much faster to process lots of short strings.
$string =
substr($input, $start, $chunk);
// Find the last space character if we aren't at the end.
$ls =
(($start +
$chunk) <
$end ?
strrpos($string, ' ') :
false);
// Truncate to the last space character.
$string =
substr($string, 0, $ls);
// Adjust the start position for the next iteration.
$start +=
($ls !==
false ?
($ls +
1 -
$chunk) +
$chunk :
$chunk);
// Add the tokens to the database.
// Check if we're approaching the memory limit of the token table.
if ($count >
self::$state->options->get('memory_table_limit', 30000))
// Add the tokens to the database.
* Method to add a set of tokens to the database.
* @param mixed $tokens An array or single FinderIndexerToken object.
* @param mixed $context The context of the tokens. See context constants. [optional]
* @return integer The number of tokens inserted into the database.
* @throws Exception on database error.
// Abstract and protected: a concrete subclass must implement it, and it is not callable from outside the class hierarchy.
// $context is optional and defaults to an empty string.
abstract protected function addTokensToDB($tokens, $context =
'');
* Method to switch the token tables from Memory tables to MyISAM tables
* when they are close to running out of memory.
* @param boolean $memory Flag to control how they should be toggled.
* @return boolean True on success.
* @throws Exception on database error.
Documentation generated on Tue, 19 Nov 2013 15:05:31 +0100 by phpDocumentor 1.4.3