'Attachments',
    'description' => 'Administer Apache Solr Attachments.',
    'page callback' => 'apachesolr_attachments_admin_page',
    'access arguments' => array('administer search'),
    'file' => 'apachesolr_attachments.admin.inc',
    'type' => MENU_LOCAL_TASK,
  );
  $items['admin/config/search/apachesolr/attachments/test'] = array(
    'title' => 'Test tika extraction',
    'page callback' => 'apachesolr_attachments_test_tika_extraction',
    'access arguments' => array('administer search'),
    'file' => 'apachesolr_attachments.admin.inc',
    'type' => MENU_CALLBACK,
  );
  // Both confirmation paths share one form builder. The integer 6 in
  // 'page arguments' refers to the path component at index 6, i.e. the
  // trailing 'delete' / 'clear-cache' part.
  $items['admin/config/search/apachesolr/attachments/confirm/delete'] = array(
    'title' => 'Delete and reindex all files',
    'page callback' => 'drupal_get_form',
    'page arguments' => array('apachesolr_attachments_confirm', 6),
    'access arguments' => array('administer search'),
    'file' => 'apachesolr_attachments.admin.inc',
    'type' => MENU_CALLBACK,
  );
  $items['admin/config/search/apachesolr/attachments/confirm/clear-cache'] = array(
    'title' => 'Delete the local cache of file text',
    'page callback' => 'drupal_get_form',
    'page arguments' => array('apachesolr_attachments_confirm', 6),
    'access arguments' => array('administer search'),
    'file' => 'apachesolr_attachments.admin.inc',
    'type' => MENU_CALLBACK,
  );
  return $items;
}

/**
 * Implements hook_apachesolr_entity_info_alter().
 *
 * Marks the 'file' entity type as indexable and registers this module's
 * status, document, reindex and result callbacks together with the name of
 * the index table used for files.
 *
 * @param array $entity_info
 *   The entity info array, altered by reference. Status and document
 *   callbacks are appended; the reindex callback, result callback and index
 *   table are assigned, replacing any existing value.
 */
function apachesolr_attachments_apachesolr_entity_info_alter(&$entity_info) {
  $entity_info['file']['indexable'] = TRUE;
  $entity_info['file']['status callback'][] = 'apachesolr_attachments_status_callback';
  $entity_info['file']['document callback'][] = 'apachesolr_attachments_solr_document';
  $entity_info['file']['reindex callback'] = 'apachesolr_attachments_solr_reindex';
  $entity_info['file']['index_table'] = 'apachesolr_index_entities_file';
  $entity_info['file']['result callback'] = 'apachesolr_attachments_file_result';
}

/**
 * Builds the file-specific information for a Solr document.
*
 * One document is produced for every parent entity that the index table
 * lists for this file. Each document starts from the output of the parent
 * entity type's own document callbacks and is then overwritten with the
 * file's id, URL, label, extracted text and dates, plus a compact JSON
 * description of the parent entity.
 *
 * @param ApacheSolrDocument $document
 *   The Solr document we are building up.
 * @param stdClass $file
 *   The file entity we are indexing.
 * @param string $entity_type
 *   The type of entity we're dealing with.
 * @param string $env_id
 *   The Solr environment id; handed on to the parent's document callbacks.
 *
 * @return array
 *   A list of documents, one per usable parent entity. Empty when no text
 *   could be extracted from the file or no parent produced a document.
 */
function apachesolr_attachments_solr_document(ApacheSolrDocument $document, $file, $entity_type, $env_id) {
  module_load_include('inc', 'apachesolr_attachments', 'apachesolr_attachments.index');
  $documents = array();
  $table = apachesolr_get_indexer_table('file');

  // Text is saved in the index table. Will be used by the node indexing if
  // available.
  $text = apachesolr_attachments_get_attachment_text($file);
  // If we don't have extracted text we should stop our process here.
  if (empty($text)) {
    return $documents;
  }

  // Get the list of parents that we should index from the indexing table.
  $parents = db_select($table, 'aie')
    ->fields('aie')
    ->condition('entity_type', 'file')
    ->condition('entity_id', $file->fid)
    ->execute();

  // These values depend on the file only, so compute them once instead of
  // once per parent.
  $file_label = apachesolr_clean_text($file->filename);
  $file_created = apachesolr_date_iso($file->timestamp);
  $file_url = file_create_url($file->uri);
  // The stream wrapper lookup returns FALSE for URIs it cannot handle; a
  // path is not a requirement of an entity, so simply leave it out then.
  $wrapper = file_stream_wrapper_get_instance_by_uri($file->uri);
  $path = $wrapper ? $wrapper->getExternalUrl() : NULL;

  foreach ($parents as $parent) {
    $parent_entity_type = $parent->parent_entity_type;

    // Load the parent entity and reset cache.
    $parent_entities = entity_load($parent_entity_type, array($parent->parent_entity_id), NULL, TRUE);
    $parent_entity = reset($parent_entities);
    // Skip invalid entities.
    if (empty($parent_entity)) {
      continue;
    }

    // Retrieve the parent entity id and bundle.
    list($parent_entity_id, , $parent_entity_bundle) = entity_extract_ids($parent_entity_type, $parent_entity);

    // Get the callback array to add stuff to the document.
    $callbacks = apachesolr_entity_get_callback($parent_entity_type, 'document callback');
    // Skip invalid entity types.
    if (empty($callbacks)) {
      continue;
    }

    // Get a clone of the bare minimum document.
    $filedocument = clone $document;

    $build_documents = array();
    foreach ($callbacks as $callback) {
      if (!is_callable($callback)) {
        continue;
      }
      // Call a type-specific callback to add stuff to the document.
      $built = $callback($filedocument, $parent_entity, $parent_entity_type, $env_id);
      // Ignore callbacks that did not hand back a list of documents.
      if (is_array($built)) {
        $build_documents = array_merge($build_documents, $built);
      }
    }

    // Take the top document from the stack. When the callbacks produced
    // nothing there is no document to decorate for this parent.
    $filedocument = reset($build_documents);
    if (!is_object($filedocument)) {
      continue;
    }

    // Build our separate document and overwrite basic information.
    $filedocument->id = apachesolr_document_id($file->fid . '-' . $parent_entity_type . '-' . $parent_entity_id, $entity_type);
    $filedocument->url = $file_url;
    if (!empty($path)) {
      $filedocument->path = $path;
    }

    // Add extra info to our document.
    $filedocument->label = $file_label;
    $filedocument->content = $file_label . ' ' . $text;
    $filedocument->ds_created = $file_created;
    $filedocument->ds_changed = $file_created;
    $filedocument->created = $file_created;
    $filedocument->changed = $file_created;

    // Add Parent information fields. See http://drupal.org/node/1515822 for
    // explanation.
    $parent_entity_info = entity_get_info($parent_entity_type);
    $entity_keys = $parent_entity_info['entity keys'];
    $small_parent_entity = new stdClass();
    $small_parent_entity->entity_type = $parent_entity_type;
    $small_parent_entity->{$entity_keys['id']} = $parent_entity_id;
    // Not every entity type declares a bundle or label key; only copy the
    // ones that exist so we never write to an empty property name.
    if (!empty($entity_keys['bundle'])) {
      $small_parent_entity->{$entity_keys['bundle']} = $parent_entity_bundle;
    }
    if (!empty($entity_keys['label']) && isset($parent_entity->{$entity_keys['label']})) {
      $small_parent_entity->{$entity_keys['label']} = $parent_entity->{$entity_keys['label']};
    }

    // Add all to one field because if it is spread out over
    // multiple fields there is no way of knowing which multifield value
    // belongs to which entity.
    // It does not load the complete entity in to the index because that
    // would dramatically increase the index size and processing time.
    $filedocument->zm_parent_entity = drupal_json_encode($small_parent_entity);
    $filedocument->sm_parent_entity_bundle = $parent_entity_type . "-" . $parent_entity_bundle;
    $filedocument->sm_parent_entity_type = $parent_entity_type;

    // Add Apachesolr Attachments specific fields.
    $filedocument->ss_filemime = $file->filemime;
    $filedocument->ss_filesize = $file->filesize;

    $documents[] = $filedocument;
  }

  return $documents;
}

/**
 * Reindexing callback for ApacheSolr, for file entities.
*
 * Deletes the 'file' rows of the indexer table whose body column is NULL and
 * then records a file usage for every file referenced by a file field.
 *
 * @return bool
 *   TRUE on success (also when there are no files at all), FALSE when an
 *   exception was caught and the transaction was rolled back.
 */
function apachesolr_attachments_solr_reindex() {
  module_load_include('inc', 'apachesolr_attachments', 'apachesolr_attachments.index');
  $indexer_table = apachesolr_get_indexer_table('file');
  $transaction = db_transaction();
  try {
    // Clean the table.
    db_delete($indexer_table)
      ->condition('entity_type', 'file')
      ->isNull('body')
      ->execute();

    $files = _apachesolr_attachments_get_all_files();
    // If we do not have files, return success.
    if (empty($files)) {
      return TRUE;
    }

    // Loop over all the files and add them to our indexing table.
    foreach ($files as $parent_entity_type => $parent_entities) {
      foreach ($parent_entities as $parent_entity_info) {
        // Fake our file class.
        $file = new stdClass();
        foreach ($parent_entity_info->extraFields as $key => $value) {
          // strpos() returns 0 for a match at the start of the key, so
          // compare against FALSE explicitly.
          if (strpos($key, '_fid') !== FALSE) {
            $file->fid = $value;
          }
        }
        // Without a file id there is nothing to register.
        if (empty($file->fid)) {
          continue;
        }
        list($parent_entity_id) = entity_extract_ids($parent_entity_type, $parent_entity_info);
        apachesolr_attachments_add_file_usage($file, $parent_entity_type, $parent_entity_id);
      }
    }
  }
  catch (Exception $e) {
    $transaction->rollback();
    drupal_set_message($e->getMessage(), 'error');
    watchdog_exception('Apache Solr Attachments', $e);
    return FALSE;
  }
  return TRUE;
}

/**
 * Fetches all files linked to entities through fields of type 'file',
 * regardless of the widget.
 *
 * Entity types that are not flagged as indexable for Apache Solr are skipped.
 *
 * @return array
 *   Stub entities grouped by entity type, as returned by
 *   ApachesolrAttachmentsEntityFieldQuery. Every stub carries the raw result
 *   row, including the file id column, in its extraFields property.
 */
function _apachesolr_attachments_get_all_files() {
  $results = array();
  $fields = field_info_field_by_ids();
  foreach ($fields as $field_id => $field_info) {
    if ($field_info['type'] != 'file') {
      continue;
    }
    foreach ($field_info['bundles'] as $entity_type => $bundles) {
      $entity_info = entity_get_info($entity_type);
      // If this entity type is not indexable, ignore this and move on to the
      // next one.
      if (empty($entity_info['apachesolr']['indexable'])) {
        continue;
      }
      $query = new ApachesolrAttachmentsEntityFieldQuery();
      $results_query = $query
        ->entityCondition('entity_type', $entity_type)
        ->fieldCondition($field_info['field_name'])
        // Fetch all file ids related to the entities.
        ->addExtraField($field_info['field_name'], 'fid', 'fid')
        ->execute();
      $results = array_merge_recursive($results, $results_query);
    }
  }
  return $results;
}

/**
 * Status callback for the files. Files should never be removed from the table.
 *
 * See apachesolr_attachments_apachesolr_file_exclude() for exclusion of items.
 *
 * @param int $entity_id
 *   The id of the file entity.
 * @param string $entity_type
 *   The entity type to load; 'file' for the callback registered here.
 *
 * @return bool
 *   FALSE when the file cannot be loaded, its mime type is not allowed, it is
 *   not a real file on disk or its status is not 1; TRUE otherwise.
 */
function apachesolr_attachments_status_callback($entity_id, $entity_type) {
  module_load_include('inc', 'apachesolr_attachments', 'apachesolr_attachments.index');
  // Load the entity and reset cache.
  $entities = entity_load($entity_type, array($entity_id), NULL, TRUE);
  $entity = reset($entities);

  // A file that no longer exists can not be indexed.
  if (empty($entity)) {
    return FALSE;
  }
  // Check if the mimetype is allowed.
  if (apachesolr_attachments_allowed_mime($entity->filemime) == FALSE) {
    // Set status to 0 and remove from the index.
    return FALSE;
  }
  // Check if the file is a real file.
  if (apachesolr_attachments_is_file($entity) == FALSE) {
    // Set status to 0 and remove from the index.
    return FALSE;
  }
  // Check if the entity status is active.
  if ($entity->status != 1) {
    // Set status to 0 and remove from the index.
    return FALSE;
  }
  // Keep status at 1.
  return TRUE;
}

/**
 * Implements hook_apachesolr_ENTITY_TYPE_exclude().
 *
 * This is invoked for each file entity that is being inspected to be added to
 * the index. If any module returns TRUE, the entity is skipped for indexing.
 *
 * @param int $entity_id
 *   The id of the file entity.
 * @param object $row
 *   A complete set of data from the indexing table.
 * @param string $env_id
 *   The Solr environment id.
 *
 * @return bool
 *   TRUE to exclude the file from indexing, FALSE to keep it.
 */
function apachesolr_attachments_apachesolr_file_exclude($entity_id, $row, $env_id) {
  module_load_include('inc', 'apachesolr_attachments', 'apachesolr_attachments.index');
  // A file without an id or without a parent entity can not be indexed.
  if (!$entity_id || empty($row->parent_entity_id)) {
    return TRUE;
  }

  // Exclude the file whenever its parent entity is excluded.
  $exclude = apachesolr_attachments_is_parent_excluded($entity_id, 'file', $row->parent_entity_id, $row->parent_entity_type, $env_id);
  return $exclude ? TRUE : FALSE;
}

/**
 * Checks whether a file entity points to an existing regular file.
 *
 * @param object $entity
 *   The file entity; its uri property is inspected.
 *
 * @return bool
 *   TRUE when the uri resolves to a real path that is a file. FALSE when the
 *   uri is empty, can not be resolved, or is not a file; the last case is
 *   also logged as a warning.
 */
function apachesolr_attachments_is_file($entity) {
  if (empty($entity->uri)) {
    return FALSE;
  }
  $filepath = drupal_realpath($entity->uri);
  // Check that we have a valid filepath.
  if (!$filepath) {
    return FALSE;
  }
  if (!is_file($filepath)) {
    watchdog('Apache Solr Attachments', '%filepath is not a valid file path', array('%filepath' => $entity->uri), WATCHDOG_WARNING);
    return FALSE;
  }
  return TRUE;
}

/**
 * Excludes a file if the parent_entity is set to status 0 or it is not
 * being indexed.
 *
 * @param int $entity_id
 *   The id of the file entity (not used by the checks below).
 * @param string $entity_type
 *   The entity type of the file (not used by the checks below).
 * @param int $parent_entity_id
 *   The id of the entity the file is attached to.
 * @param string $parent_entity_type
 *   The entity type of the parent.
 * @param string $env_id
 *   The Solr environment id whose indexed bundles are consulted.
 *
 * @return bool
 *   TRUE when the file must be excluded: the parent no longer exists, its
 *   bundle is not indexed in this environment, it has no status callbacks,
 *   or one of its status callbacks returned a false value. FALSE otherwise.
 */
function apachesolr_attachments_is_parent_excluded($entity_id, $entity_type, $parent_entity_id, $parent_entity_type, $env_id) {
  $query = new EntityFieldQuery();
  $result = $query
    ->entityCondition('entity_type', $parent_entity_type)
    ->entityCondition('entity_id', $parent_entity_id)
    ->execute();

  if (empty($result[$parent_entity_type])) {
    // Parent entity id does not exist anymore.
    return TRUE;
  }
  // Since we only expect 1 entity to return, take the first stub.
  $stub_entity = reset($result[$parent_entity_type]);

  // Only nodes keep their bundle in a 'type' property, so ask the entity
  // system for the bundle instead of reading $stub_entity->type directly.
  try {
    list(, , $parent_entity_bundle) = entity_extract_ids($parent_entity_type, $stub_entity);
  }
  catch (EntityMalformedException $e) {
    // Without a bundle we can not tell whether the parent is indexed.
    return TRUE;
  }

  // Ignore this parent if the bundles to be indexed for this entity type
  // are not indexed.
  $bundles = apachesolr_get_index_bundles($env_id, $parent_entity_type);
  if (empty($bundles) || !in_array($parent_entity_bundle, $bundles)) {
    // Exclude it.
    return TRUE;
  }

  // Skip indexing of files if the node was excluded by apache solr.
  $status_callbacks = apachesolr_entity_get_callback($parent_entity_type, 'status callback');
  if (empty($status_callbacks)) {
    // Exclude by default.
    return TRUE;
  }

  // Set status to true. Allow the callbacks to make the change.
  $status = TRUE;
  // Check status callback before sending to the index.
  foreach ($status_callbacks as $status_callback) {
    if (is_callable($status_callback)) {
      // By placing $status in front we prevent calling any other callback
      // after one status callback returned false.
      $status = $status && $status_callback($parent_entity_id, $parent_entity_type);
    }
  }
  // TRUE means the status is ok. We should return FALSE so it does
  // not exclude.
  return !$status;
}

/**
 * For a particular entity, remove all file attachments from the Solr index.
 *
 * This function is not in use in the module but can come in handy for people
 * that prefer to use functions.
 *
 * After the first failure a static flag makes every later call in the same
 * request return FALSE without contacting Solr.
 *
 * NOTE(review): the delete query filters on an "sm_parent_entity" field, but
 * apachesolr_attachments_solr_document() only writes zm_parent_entity,
 * sm_parent_entity_bundle and sm_parent_entity_type. Confirm that the field
 * queried here exists in the index, otherwise nothing will match.
 *
 * @param string $parent_entity_type
 *   The entity type of the parent.
 * @param object $parent_entity
 *   The parent entity whose attachments should be removed.
 *
 * @return bool
 *   TRUE when the delete and commit were sent, FALSE on failure.
 *
 * @see apachesolr_delete_node_from_index()
 */
function apachesolr_attachments_remove_attachments_from_index($parent_entity_type, $parent_entity) {
  static $failed = FALSE;
  if ($failed) {
    return FALSE;
  }
  try {
    // Retrieve the parent entity id and bundle.
    list($parent_entity_id, , $parent_entity_bundle) = entity_extract_ids($parent_entity_type, $parent_entity);
    $solr = apachesolr_get_solr();
    $solr->deleteByQuery("sm_parent_entity:{$parent_entity_type}-{$parent_entity_bundle}-{$parent_entity_id} AND entity_type:file AND hash:" . apachesolr_site_hash());
    $solr->commit();
    return TRUE;
  }
  catch (Exception $e) {
    watchdog('Apache Solr Attachments', nl2br(check_plain($e->getMessage())), NULL, WATCHDOG_ERROR);
    // Don't keep trying queries if they are failing.
    $failed = TRUE;
    return FALSE;
  }
}

/**
 * Implements hook_apachesolr_query_alter().
 *
 * Requests the file specific fields for regular searches and filters file
 * documents out of "more like this" queries.
 */
function apachesolr_attachments_apachesolr_query_alter(DrupalSolrQueryInterface $query) {
  if ($query->getName() == 'apachesolr') {
    // Fetch the extra file data on searches.
    $query->addParam('fl', array('zm_parent_entity', 'ss_filemime', 'ss_file_entity_title', 'ss_file_entity_url'));
  }
  elseif ($query->getName() == 'apachesolr_mlt') {
    // Exclude files from MLT results.
    $query->addFilter('entity_type', 'file', TRUE);
  }
}

/**
 * Implements hook_entity_update().
 *
 * hook_entity_OP functions need to happen to clean up the table after files
 * were added to the index table.
 */
function apachesolr_attachments_entity_update($entity, $type) {
  module_load_include('inc', 'apachesolr_attachments', 'apachesolr_attachments.index');
  apachesolr_attachments_clean_index_table();
}

/**
 * Implements hook_entity_insert().
 *
 * Inserts are handled exactly like updates.
 */
function apachesolr_attachments_entity_insert($entity, $type) {
  apachesolr_attachments_entity_update($entity, $type);
}

/**
 * Implements hook_entity_delete().
 *
 * Cleans up the index table after an entity was removed.
 */
function apachesolr_attachments_entity_delete($entity, $type) {
  module_load_include('inc', 'apachesolr_attachments', 'apachesolr_attachments.index');
  apachesolr_attachments_clean_index_table();
}

/**
 * Runs a file usage callback for every file attached to a parent entity.
 *
 * Shared implementation of the field attach hooks below.
 *
 * @param string $parent_entity_type
 *   The entity type of the parent.
 * @param object $parent_entity
 *   The parent entity whose file fields are inspected.
 * @param string $usage_callback
 *   Name of the function to call as
 *   $usage_callback($file, $parent_entity_type, $parent_entity_id).
 */
function _apachesolr_attachments_process_attached_files($parent_entity_type, $parent_entity, $usage_callback) {
  // Not all entities sent through this function have the same
  // syntax and/or content. We only check entities that have a type.
  // This excludes comments, so if there would be a file entity attached to
  // a comment, this would not be picked up.
  // TODO: If you are a entity magician, please improve.
  if (!isset($parent_entity->type)) {
    return;
  }

  foreach (field_info_instances($parent_entity_type, $parent_entity->type) as $instance) {
    $field_info = field_info_field($instance['field_name']);
    if ($field_info['type'] != 'file') {
      continue;
    }
    // Include the file after the type check, otherwise it'll get included
    // everywhere.
    module_load_include('inc', 'apachesolr_attachments', 'apachesolr_attachments.index');
    $items = field_get_items($parent_entity_type, $parent_entity, $field_info['field_name']);
    // field_get_items() returns FALSE when the field has no values; looping
    // over that would raise a PHP warning.
    if (empty($items)) {
      continue;
    }
    foreach ($items as $file_info) {
      $file = file_load($file_info['fid']);
      // Discard empty entities.
      if (empty($file)) {
        continue;
      }
      // Retrieve parent entity id and hand the file to the usage callback.
      list($parent_entity_id) = entity_extract_ids($parent_entity_type, $parent_entity);
      $usage_callback($file, $parent_entity_type, $parent_entity_id);
    }
  }
}

/**
 * Implements hook_field_attach_insert().
 *
 * Hook into the field operations
 * - we want to save the same data in a shadow copy table for easier indexing.
 * - We do not delete the file / media entity when its usage count goes to 0
 *   but instead we set status to 0.
 * - This is meant to make the backport to drupal 6 easier
 */
function apachesolr_attachments_field_attach_insert($parent_entity_type, $parent_entity) {
  apachesolr_attachments_field_attach_update($parent_entity_type, $parent_entity);
}

/**
 * Implements hook_field_attach_update().
 *
 * Records a file usage for every file attached to the saved entity.
 */
function apachesolr_attachments_field_attach_update($parent_entity_type, $parent_entity) {
  _apachesolr_attachments_process_attached_files($parent_entity_type, $parent_entity, 'apachesolr_attachments_add_file_usage');
}

/**
 * Implements hook_field_attach_delete().
 *
 * Removes the file usage of every file that was attached to the deleted
 * entity.
 */
function apachesolr_attachments_field_attach_delete($parent_entity_type, $parent_entity) {
  _apachesolr_attachments_process_attached_files($parent_entity_type, $parent_entity, 'apachesolr_attachments_delete_file_usage');
}

/**
 * Callback function for file search results.
 *
 * @param stdClass $doc
 *   The result document from Apache Solr. Its uid property is set from
 *   is_uid so it can be themed as a user account.
 * @param array $result
 *   The result array for this record to which to add. Existing keys are kept.
 * @param array $extra
 *   Extra result information; not modified here.
 */
function apachesolr_attachments_file_result($doc, &$result, &$extra) {
  $doc->uid = $doc->is_uid;
  $result += array(
    'type' => t('File attachment'),
    'user' => theme('username', array('account' => $doc)),
    'date' => isset($doc->created) ? $doc->created : 0,
    'node' => $doc,
    'file' => $doc,
    'uid' => $doc->is_uid,
  );
}

/**
 * Implements hook_theme().
 */
function apachesolr_attachments_theme() {
  return array(
    'apachesolr_search_snippets__file' => array(
      'variables' => array('doc' => NULL, 'snippets' => array()),
    ),
  );
}

/**
 * Themes the snippet of a file search result.
 *
 * Outputs the snippets, the mime type of the file (with an icon when the
 * file module is enabled) and links to the entities the file is attached to.
 *
 * @todo Vastly improve this theming function
 *
 * @param array $vars
 *   Theme variables: 'doc' is the Solr result document, 'snippets' the list
 *   of highlighted text fragments.
 *
 * @return string
 *   The rendered snippet.
 */
function theme_apachesolr_search_snippets__file($vars) {
  $doc = $vars['doc'];
  $snippets = $vars['snippets'];
  $parent_entity_links = array();

  // Retrieve our parent entities. They have been saved as
  // a small serialized entity. The field may be missing from the result.
  $encoded_parents = isset($doc->zm_parent_entity) ? (array) $doc->zm_parent_entity : array();
  foreach ($encoded_parents as $parent_entity_encoded) {
    $parent_entity = (object) drupal_json_decode($parent_entity_encoded);
    // Skip values that could not be decoded into a usable stub.
    if (empty($parent_entity->entity_type)) {
      continue;
    }
    $parent_label = entity_label($parent_entity->entity_type, $parent_entity);
    $parent_entity_uri = entity_uri($parent_entity->entity_type, $parent_entity);
    if (empty($parent_entity_uri['path'])) {
      // entity_uri() returns NULL for entities without a URI of their own;
      // show the plain label instead of a link.
      $parent_entity_links[] = check_plain($parent_label);
      continue;
    }
    $parent_entity_uri['options']['absolute'] = TRUE;
    $parent_entity_links[] = l($parent_label, $parent_entity_uri['path'], $parent_entity_uri['options']);
  }

  if (module_exists('file')) {
    $file_type = t('!icon @filemime', array('@filemime' => $doc->ss_filemime, '!icon' => theme('file_icon', array('file' => (object) array('filemime' => $doc->ss_filemime)))));
  }
  else {
    $file_type = t('@filemime', array('@filemime' => $doc->ss_filemime));
  }

  return implode(' ... ', $snippets) . '' . $file_type . ' attached to:' . implode(', ', $parent_entity_links) . '';
}

/**
 * Provides a default list of filename extensions to exclude from the index.
 *
 * @return
 *   An array of file extensions.
 */
function apachesolr_attachments_default_excluded() {
  $default = array('aif', 'art', 'avi', 'bmp', 'gif', 'ico', 'jpg', 'mov', 'mp3', 'mp4', 'mpg', 'oga', 'ogv', 'png', 'psd', 'ra', 'ram', 'rgb', 'tif', 'wmv');
  return $default;
}

/**
 * EntityFieldQuery that can return extra columns next to the entity ids.
 *
 * Columns registered with addExtraField() are added to the select query and
 * the complete result row is exposed on every returned stub entity as
 * $entity->extraFields.
 */
class ApachesolrAttachmentsEntityFieldQuery extends EntityFieldQuery {
  // Extra added fields to the query.
  private $addedFields = array();

  /**
   * Finishes the query.
   *
   * Adds tags, metaData, range and returns the requested list or count.
   *
   * @param SelectQuery $select_query
   *   A SelectQuery which has entity_type, entity_id, revision_id and bundle
   *   fields added.
   * @param $id_key
   *   Which field's values to use as the returned array keys.
   *
   * @return
   *   See EntityFieldQuery::execute().
   */
  public function finishQuery($select_query, $id_key = 'entity_id') {
    foreach ($this->tags as $tag) {
      $select_query->addTag($tag);
    }
    foreach ($this->metaData as $key => $object) {
      $select_query->addMetaData($key, $object);
    }
    $select_query->addMetaData('entity_field_query', $this);
    if ($this->range) {
      $select_query->range($this->range['start'], $this->range['length']);
    }
    if ($this->count) {
      return $select_query->countQuery()->execute()->fetchField();
    }

    if (!empty($this->addedFields)) {
      // The extra columns are read from the table that provides entity_id;
      // look it up once instead of on every iteration.
      $fields = $select_query->getFields();
      $table = $fields['entity_id']['table'];
      foreach ($this->addedFields as $addedField) {
        if (!empty($addedField['field_name'])) {
          // Without an explicit alias fall back to the column name, so the
          // alias never ends in a dangling underscore.
          $alias_suffix = isset($addedField['column_alias']) ? $addedField['column_alias'] : $addedField['column'];
          $column = $addedField['field_name'] . '_' . $addedField['column'];
          $column_alias = $addedField['field_name'] . '_' . $alias_suffix;
        }
        else {
          $column = $addedField['column'];
          $column_alias = $addedField['column_alias'];
        }
        $select_query->addField($table, $column, $column_alias);
      }
    }

    $return = array();
    // NOTE(review): results are keyed by $id_key, so if the extra columns
    // make the query return several rows for one entity, later rows replace
    // earlier ones. Confirm this is intended for multi-value fields.
    foreach ($select_query->execute() as $partial_entity) {
      $bundle = isset($partial_entity->bundle) ? $partial_entity->bundle : NULL;
      $entity = entity_create_stub_entity($partial_entity->entity_type, array($partial_entity->entity_id, $partial_entity->revision_id, $bundle));
      // Expose the complete result row, including the extra columns.
      $entity->extraFields = $partial_entity;
      $return[$partial_entity->entity_type][$partial_entity->$id_key] = $entity;
      $this->ordered_results[] = $partial_entity;
    }
    return $return;
  }

  /**
   * Registers an extra column to fetch together with the entity ids.
   *
   * @param string $field_name
   *   The field name used as prefix for the column and its alias. When empty,
   *   $column and $column_alias are used as given.
   * @param string $column
   *   The column to select, without the field name prefix.
   * @param string $column_alias
   *   (optional) The alias for the column, without the field name prefix.
   *
   * @return ApachesolrAttachmentsEntityFieldQuery
   *   The called object, to allow chaining.
   */
  public function addExtraField($field_name, $column, $column_alias = NULL) {
    $this->addedFields[] = array(
      'field_name' => $field_name,
      'column' => $column,
      'column_alias' => $column_alias,
    );
    return $this;
  }
}