Here are the steps to follow in order to use the OpenNLP AI models with Opensolr.
In this example, we will cover named entity recognition (NER): how to extract named entities using the default OpenNLP models.
Please note that, at this time, the models are only enabled in the Germany, Solr Version 9 environment, so we recommend creating your index there when you are on the Add New Index page in your Opensolr Control Panel.
We can, however, set up your models in any region, including your own dedicated Opensolr infrastructure for Corporate accounts. Simply drop us a note via the Support Helpdesk system, and we'll be happy to enable any models for you.
You can download the default OpenNLP models from the OpenNLP website, or from Opensolr, here.
<!-- Dynamic field matching every field name that ends in _s: a multi-valued,
     indexed and stored string field with term vectors, positions and offsets
     enabled. The people_s, location_s, organization_s and language_s fields
     written by the "nlp" update chain all resolve to this definition. -->
<dynamicField name="*_s"
              type="string"
              indexed="true"
              stored="true"
              multiValued="true"
              termVectors="true"
              termPositions="true"
              termOffsets="true"
              storeOffsetsWithPositions="true"/>
<!-- Field type whose analyzer is driven by the English OpenNLP models.
     It is referenced by name (analyzerFieldType = text_nlp) from the
     named-entity extraction processors in the update chain. -->
<fieldType name="text_nlp" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <!-- Character mapping applied before tokenization, using the rules in
         mapping-ISOLatin1Accent.txt. -->
    <charFilter class="solr.MappingCharFilterFactory"
                mapping="mapping-ISOLatin1Accent.txt"/>

    <!-- OpenNLP tokenizer: needs both a sentence model and a tokenizer model. -->
    <tokenizer class="solr.OpenNLPTokenizerFactory"
               sentenceModel="en-sent.bin"
               tokenizerModel="en-token.bin"/>

    <!-- Part-of-speech tagging with the en-pos-maxent.bin model. -->
    <filter class="solr.OpenNLPPOSFilterFactory"
            posTaggerModel="en-pos-maxent.bin"/>

    <!-- Phrase chunking with the en-chunker.bin model. -->
    <filter class="solr.OpenNLPChunkerFilterFactory"
            chunkerModel="en-chunker.bin"/>

    <!-- Stores each token's type as its payload. -->
    <filter class="solr.TypeAsPayloadFilterFactory"/>
  </analyzer>
</fieldType>
<!-- Route every request sent to /update through the "nlp" processor chain by default. -->
<requestHandler name="/update" class="solr.UpdateRequestHandler">
  <lst name="defaults">
    <str name="update.chain">nlp</str>
  </lst>
</requestHandler>

<!-- The "nlp" chain: named-entity extraction, language detection,
     de-duplication of the extracted values, then the actual update. -->
<updateRequestProcessorChain name="nlp">

  <!-- Extract English-language people names from the title and description
       fields, and put them in the multi-valued string field people_s. -->
  <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
    <str name="modelFile">en-ner-person.bin</str>
    <str name="analyzerFieldType">text_nlp</str>
    <arr name="source">
      <str>title</str>
      <str>description</str>
    </arr>
    <str name="dest">people_s</str>
  </processor>

  <!-- Extract Spanish-language people names from the title and description
       fields, and put them in the same people_s field. -->
  <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
    <str name="modelFile">es-ner-person.bin</str>
    <str name="analyzerFieldType">text_nlp</str>
    <arr name="source">
      <str>title</str>
      <str>description</str>
    </arr>
    <str name="dest">people_s</str>
  </processor>

  <!-- Extract locations from title and description into location_s. -->
  <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
    <str name="modelFile">en-ner-location.bin</str>
    <str name="analyzerFieldType">text_nlp</str>
    <arr name="source">
      <str>title</str>
      <str>description</str>
    </arr>
    <str name="dest">location_s</str>
  </processor>

  <!-- Extract organizations from title and description into organization_s. -->
  <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
    <str name="modelFile">en-ner-organization.bin</str>
    <str name="analyzerFieldType">text_nlp</str>
    <arr name="source">
      <str>title</str>
      <str>description</str>
    </arr>
    <str name="dest">organization_s</str>
  </processor>

  <!-- Detect the language of each Solr document from the title, text and
       description fields, using the langdetect-183.bin model, and store the
       result in language_s. -->
  <processor class="org.apache.solr.update.processor.OpenNLPLangDetectUpdateProcessorFactory">
    <str name="langid.fl">title,text,description</str>
    <str name="langid.langField">language_s</str>
    <str name="langid.model">langdetect-183.bin</str>
  </processor>

  <!-- De-duplicate the values of every field matching .*_s, so that the same
       extracted name, organization or location is not stored more than once
       in a target field. -->
  <processor class="solr.UniqFieldsUpdateProcessorFactory">
    <str name="fieldRegex">.*_s</str>
  </processor>

  <!-- Executes the actual update; keep this as the final processor in the chain. -->
  <processor class="solr.RunUpdateProcessorFactory"/>

</updateRequestProcessorChain>
{ "id": "1", "title": "Jack Sparrow was a pirate. Many feared him. He used to live in downtown Las Vegas.", "description": "Jack Sparrow and Janette Sparrowa, are now on their way to Monte Carlo for the summer vacation, after working hard for Microsoft, creating the new and exciting Windows 11 which everyone now loves. :)", "text": "The Apache OpenNLP project is developed by volunteers and is always looking for new contributors to work on all parts of the project. Every contribution is welcome and needed to make it better. A contribution can be anything from a small documentation typo fix to a new component. Learn more about how you can get involved." }