It looks like interest in Sitecore's implementation of the Lucene index has risen since the Dream Core event, and developers have run into an issue with old data being kept in the index repository. In this article I want to show you how to work around this issue.
First of all, let’s see why it’s happening. I ran into this issue when I started playing with the new implementation of the Lucene index in Sitecore 6. When I created an output of the results, I saw duplicates of my data in there. I started debugging my code and found that Lucene somehow recognizes raw GUIDs, which breaks the search criteria that Sitecore uses to find items during the update/delete procedure.
To solve this issue I had to create an additional field in the Lucene index (_shorttemplateid) and store in it the item's template ID in short GUID format (Sitecore.Data.ShortID.Encode(item.TemplateID)). I then overrode the AddMatchCriteria method and its dependent properties so that the short template GUID is used for the matching criteria. Below is the code example.
namespace LuceneExamples
{
    /// <summary>
    /// Database crawler that extends the stock Sitecore crawler with three things:
    /// date/datetime fields indexed in the <see cref="Constants.DateTimeFormat"/> layout,
    /// an extra <see cref="Constants.ShortTemplate"/> field holding the template id in
    /// ShortID form, and configurable custom fields.
    /// </summary>
    public class DatabaseCrawler : Sitecore.Search.Crawlers.DatabaseCrawler
    {
        #region Fields

        private bool _hasIncludes;
        private bool _hasExcludes;

        // Template id string -> true (included) / false (excluded).
        // Case-insensitive so that a GUID registered in lower case from configuration
        // still matches the value produced by item.TemplateID.ToString() in IsMatch.
        private readonly Dictionary<string, bool> _templateFilter;

        // Custom field definitions registered through AddCustomField.
        private readonly List<CustomField> _customFields;

        #endregion Fields

        #region ctor

        /// <summary>
        /// Creates a crawler with an empty template filter and no custom fields.
        /// </summary>
        public DatabaseCrawler()
        {
            _templateFilter = new Dictionary<string, bool>(StringComparer.OrdinalIgnoreCase);
            _customFields = new List<CustomField>();
        }

        #endregion ctor

        #region Base class methods

        /// <summary>
        /// Adds the item's fields to the Lucene document.
        /// Overridden so that date fields are written in the "yyyyMMddHHmmss" format
        /// (otherwise range queries over date values are not possible), and so that the
        /// _shorttemplateid field and the configured custom fields are appended.
        /// </summary>
        /// <param name="document">Lucene document being built.</param>
        /// <param name="item">Sitecore item being indexed.</param>
        /// <param name="versionSpecific">
        /// Only fields whose Shared flag differs from this value are processed.
        /// </param>
        protected override void AddAllFields(Document document, Item item, bool versionSpecific)
        {
            Assert.ArgumentNotNull(document, "document");
            Assert.ArgumentNotNull(item, "item");

            Sitecore.Collections.FieldCollection fields = item.Fields;
            fields.ReadAll();

            foreach (Sitecore.Data.Fields.Field field in fields)
            {
                if (string.IsNullOrEmpty(field.Key) || field.Shared == versionSpecific)
                {
                    continue;
                }

                bool tokenize = base.IsTextField(field);

                if (IndexAllFields)
                {
                    if (field.TypeKey == "date" || field.TypeKey == "datetime")
                    {
                        IndexDateFields(document, field.Key, field.Value);
                    }
                    else
                    {
                        document.Add(CreateField(field.Key, field.Value, tokenize, 1f));
                    }
                }

                // Text fields additionally feed the shared content field.
                if (tokenize)
                {
                    document.Add(CreateField(BuiltinFields.Content, field.Value, true, 1f));
                }
            }

            AddShortTemplateId(document, item);
            AddCustomFields(document, item);
        }

        /// <summary>
        /// Adds every registered custom field to the document of the indexed item.
        /// </summary>
        /// <param name="document">Lucene document</param>
        /// <param name="item">Sitecore data item</param>
        private void AddCustomFields(Document document, Item item)
        {
            foreach (CustomField customField in _customFields)
            {
                document.Add(CreateField(
                    customField.LuceneFieldName,
                    customField.GetFieldValue(item),
                    customField.StorageType,
                    customField.IndexType,
                    Boost));
            }
        }

        /// <summary>
        /// Creates a Lucene field with explicit storage and index options.
        /// </summary>
        /// <param name="fieldKey">Field name</param>
        /// <param name="fieldValue">Field value; null is indexed as an empty string.</param>
        /// <param name="storeType">Storage option</param>
        /// <param name="indexType">Index type</param>
        /// <param name="boost">Boosting parameter</param>
        /// <returns>The configured Lucene field.</returns>
        private Fieldable CreateField(string fieldKey, string fieldValue, Field.Store storeType, Field.Index indexType, float boost)
        {
            // Lucene's Field constructor rejects a null value; a custom field that yields
            // no value must not abort indexing of the whole item.
            Field field = new Field(fieldKey, fieldValue ?? string.Empty, storeType, indexType);
            field.SetBoost(boost);
            return field;
        }

        /// <summary>
        /// Parses a configuration entry for a custom field and adds it to the collection of custom fields.
        /// </summary>
        /// <param name="node">Configuration entry</param>
        /// <exception cref="InvalidOperationException">The entry could not be parsed.</exception>
        public void AddCustomField(XmlNode node)
        {
            Assert.ArgumentNotNull(node, "node");

            CustomField field = CustomField.ParseConfigNode(node);
            if (field == null)
            {
                throw new InvalidOperationException("Could not parse custom field entry: " + node.OuterXml);
            }

            _customFields.Add(field);
        }

        /// <summary>
        /// Builds the criteria identifying documents that belong to this crawler.
        /// Uses _shorttemplateid so that combined/boolean search queries can reference the
        /// template id; the same criteria are used for update/delete actions.
        /// </summary>
        /// <param name="query">Boolean query that receives the clauses.</param>
        protected override void AddMatchCriteria(BooleanQuery query)
        {
            query.Add(new TermQuery(new Term(BuiltinFields.Database, Database)), BooleanClause.Occur.MUST);
            query.Add(new TermQuery(new Term(BuiltinFields.Path, Sitecore.Data.ShortID.Encode(Root).ToLowerInvariant())), BooleanClause.Occur.MUST);

            if (!HasIncludes && !HasExcludes)
            {
                return;
            }

            // NOTE(review): per the Lucene BooleanQuery documentation, SHOULD clauses are
            // optional when MUST clauses are present, so included templates appear not to
            // restrict the match here (excluded ones do, via MUST_NOT). Left as is because
            // narrowing the criteria would change which documents are matched for deletion
            // - confirm the intended behaviour.
            foreach (KeyValuePair<string, bool> pair in TemplateFilter)
            {
                string shortTemplateId = Sitecore.Data.ShortID.Encode(pair.Key).ToLowerInvariant();
                BooleanClause.Occur occur = pair.Value ? BooleanClause.Occur.SHOULD : BooleanClause.Occur.MUST_NOT;
                query.Add(new TermQuery(new Term(Constants.ShortTemplate, shortTemplateId)), occur);
            }
        }

        /// <summary>
        /// Decides whether an item belongs to this crawler.
        /// Overridden because this class keeps its own include/exclude state.
        /// </summary>
        /// <param name="item">Item to test.</param>
        /// <returns>
        /// False when the item is not below the root item. Otherwise true when no template
        /// filter is configured, the filter flag when the item's template is listed, and
        /// for unlisted templates true only if no includes are configured.
        /// </returns>
        protected override bool IsMatch(Item item)
        {
            Assert.ArgumentNotNull(item, "item");

            if (!RootItem.Axes.IsAncestorOf(item))
            {
                return false;
            }

            if (!HasIncludes && !HasExcludes)
            {
                return true;
            }

            bool included;
            if (TemplateFilter.TryGetValue(item.TemplateID.ToString(), out included))
            {
                return included;
            }

            // Unlisted template: accepted only when the filter consists of excludes alone.
            return !HasIncludes;
        }

        /// <summary>
        /// Registers a template whose items are indexed.
        /// Hides the base method so that the filter state read by AddMatchCriteria and
        /// IsMatch in this class is the one being updated.
        /// </summary>
        /// <param name="templateId">Template id string.</param>
        public new void IncludeTemplate(string templateId)
        {
            Assert.ArgumentNotNullOrEmpty(templateId, "templateId");
            _hasIncludes = true;
            _templateFilter[templateId] = true;
        }

        /// <summary>
        /// Registers a template whose items are not indexed.
        /// Hides the base method so that the filter state read by AddMatchCriteria and
        /// IsMatch in this class is the one being updated.
        /// </summary>
        /// <param name="templateId">Template id string.</param>
        public new void ExcludeTemplate(string templateId)
        {
            Assert.ArgumentNotNullOrEmpty(templateId, "templateId");
            _hasExcludes = true;
            _templateFilter[templateId] = false;
        }

        #endregion Base class methods

        /// <summary>
        /// Converts a Sitecore date or datetime field value to the format used for Lucene
        /// and adds it to the document as an untokenized field.
        /// </summary>
        /// <param name="doc">Lucene document object</param>
        /// <param name="fieldKey">Field name</param>
        /// <param name="fieldValue">Field value</param>
        private void IndexDateFields(Document doc, string fieldKey, string fieldValue)
        {
            DateTime dateTime = Sitecore.DateUtil.IsoDateToDateTime(fieldValue);

            // DateTime.MinValue is indexed as an empty string
            // (presumably what the conversion yields for an empty or unparsable value).
            string luceneDate = dateTime != DateTime.MinValue
                ? dateTime.ToString(Constants.DateTimeFormat)
                : "";

            doc.Add(CreateField(fieldKey, luceneDate, false, 1f));
        }

        /// <summary>
        /// Adds the template id in lower-case ShortID format.
        /// </summary>
        /// <param name="doc">Lucene document object</param>
        /// <param name="item">Sitecore item</param>
        private void AddShortTemplateId(Document doc, Item item)
        {
            string shortTemplateId = Sitecore.Data.ShortID.Encode(item.TemplateID).ToLowerInvariant();
            doc.Add(CreateField(Constants.ShortTemplate, shortTemplateId, false, 1f));
        }

        #region Properties

        /// <summary>True once at least one template has been included.</summary>
        protected bool HasIncludes
        {
            get { return _hasIncludes; }
            set { _hasIncludes = value; }
        }

        /// <summary>True once at least one template has been excluded.</summary>
        protected bool HasExcludes
        {
            get { return _hasExcludes; }
            set { _hasExcludes = value; }
        }

        /// <summary>Template id string mapped to true (included) or false (excluded).</summary>
        protected Dictionary<string, bool> TemplateFilter
        {
            get { return _templateFilter; }
        }

        /// <summary>
        /// Root item of the crawler, fetched on every access from the configured database
        /// in the invariant language, latest version, with security checks disabled.
        /// </summary>
        protected Item RootItem
        {
            get
            {
                return Sitecore.Data.Managers.ItemManager.GetItem(
                    Root,
                    Sitecore.Globalization.Language.Invariant,
                    Sitecore.Data.Version.Latest,
                    Sitecore.Data.Database.GetDatabase(Database),
                    Sitecore.SecurityModel.SecurityCheck.Disable);
            }
        }

        #endregion Properties
    }
}
This should solve the issue, and it also adds a Lucene-recognizable format for the Sitecore date and datetime field types. It also makes it possible to build Combined and Boolean search queries.
Update. Code for the Constants class:
namespace LuceneExamples
{
    /// <summary>
    /// Shared constants for the Lucene examples. Declared static because the type only
    /// holds constants and is never meant to be instantiated.
    /// </summary>
    public static class Constants
    {
        /// <summary>Name of the special index field holding the template id in ShortID format.</summary>
        public const string ShortTemplate = "_shorttemplateid";

        /// <summary>
        /// Searchable date-time format. All date and datetime field values are written
        /// to the index in this layout.
        /// </summary>
        public const string DateTimeFormat = "yyyyMMddHHmmss";

        /// <summary>
        /// Identifier (GUID) of the Lucene settings item; its content path is
        /// /sitecore/system/Settings/Lucene.
        /// </summary>
        public const string LuceneSettingsPath = "{89783047-026C-45B5-AB5B-338E4A22446C}";
    }
}
Hope it saves someone a minute or two.
