Indexing and searching in documents using Pylucene - python-3.x

I would like to index some documents, search them for specific terms, and retrieve the positions of those terms in the documents. I have been unsuccessful so far because all the examples I can find are in Java and, more importantly, use older versions of Lucene whose API is very different from the current one.
This is my snippet that creates the index:
import pandas as pd
import operator
import lucene
from java.io import StringReader
from java.io import File
from org.apache.lucene.analysis.en import EnglishAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader, PostingsEnum, IndexOptions, IndexWriter, IndexWriterConfig
from org.apache.lucene.store import FSDirectory, ByteBuffersDirectory
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.util import Version, BytesRefIterator
# Init
# Start the JVM only when this process does not have one yet.
if not lucene.getVMEnv():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# In-memory index analysed with the English analyzer.
directory = ByteBuffersDirectory()
iconfig = IndexWriterConfig(EnglishAnalyzer())
iwriter = IndexWriter(directory, iconfig)

# Field type that keeps everything needed to read positions back later:
# postings with positions and offsets, the stored value, and term vectors
# with positions and offsets.
ft = FieldType()
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
ft.setStored(True)
ft.setTokenized(True)
ft.setStoreTermVectors(True)
ft.setStoreTermVectorOffsets(True)
ft.setStoreTermVectorPositions(True)

ts = [
    "this bernhard is the text to be index text",
    "this claudia is the text to be indexed",
]

try:
    # One Lucene document per input string, all in the "content" field.
    for t in ts:
        doc = Document()
        doc.add(Field("content", t, ft))
        iwriter.addDocument(doc)
    iwriter.commit()
finally:
    # Close the writer even if indexing fails part-way.
    iwriter.close()
This is the code with which I try to read the index and extract the positions of a term:
# BM25Similarity is not among the imports at the top of the script.
from org.apache.lucene.search.similarities import BM25Similarity

# Parse the query with the same analyzer that built the index, so query
# terms are stemmed and stop-word filtered the same way as indexed terms.
analyzer = EnglishAnalyzer()

# A single reader is shared by the searcher and the term-vector lookups.
reader = DirectoryReader.open(directory)
try:
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity(1.2, 0.75))

    # "text" occurs in both sample documents.
    query = QueryParser('content', analyzer).parse("text")

    # search() returns a TopDocs object holding scoreDocs and totalHits;
    # run it once and reuse the result.
    topDocs = searcher.search(query, 10)
    scoreDocs = topDocs.scoreDocs
    print('total hit:', topDocs.totalHits)
    print("%s total matching documents" % (len(scoreDocs)))

    # Each scoreDoc carries the matching document id and its score.
    for scoreDoc in scoreDocs:
        print(scoreDoc)
        terms = reader.getTermVector(scoreDoc.doc, "content")
        if terms is None:
            # No term vector was stored for this field in this document.
            continue
        print('fields:', terms)
        print('terms.position:', terms.hasPositions())

        # Walk every term of the document's term vector and print each
        # occurrence with its token position and character offsets.
        termsIter = terms.iterator()
        for term in BytesRefIterator.cast_(termsIter):
            postings = termsIter.postings(None, PostingsEnum.ALL)
            # A term vector covers exactly one document; nextDoc() moves
            # the enum onto it before positions can be read.
            postings.nextDoc()
            for _ in range(postings.freq()):
                position = postings.nextPosition()
                print('term: %s position: %d offsets: %d-%d' % (
                    term.utf8ToString(),
                    position,
                    postings.startOffset(),
                    postings.endOffset(),
                ))
finally:
    reader.close()
However, it is incomplete and I do not know how to complete the code. Any help is appreciated.

Related

Keyerror by find method in XML etree child

I create a simple XML structure:
# xml.etree.cElementTree was removed in Python 3.9; import the plain
# ElementTree module instead.
import xml.etree.ElementTree as ET

# <root><doc name="doc1"><rank>Employee</rank><skill>nothing</skill></doc></root>
root = ET.Element("root")
doc = ET.SubElement(root, "doc", name="doc1")
ET.SubElement(doc, "rank").text = "Employee"
ET.SubElement(doc, "skill").text = "nothing"
Now I want to read the values back, but I get this error: "KeyError: 'Employee'"
# For every <doc> child of the root, read its <rank> and <skill> text and
# its "name" attribute, then pass the three values to Logging.
for doc in root.findall('doc'):
    rank = doc.find('rank').text
    skill = doc.find('skill').text
    name = doc.get('name')
    Logging(name, rank, skill)
The issue is the parameters in the Logging method.
I switched Logging to print and everything is working.

How to know what collections I have in Firestore using Python API

I am using Python to connect to a Firestore database from a client.
The problem is that I don't know how to see which collections the database contains:
from google.cloud import firestore
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
# Initialise the Firebase app from the credentials file and open a client.
# `firestore` here is firebase_admin's module: that import comes last and
# shadows the earlier `from google.cloud import firestore`.
cred = credentials.Certificate('credentials/credentials.json')
app = firebase_admin.initialize_app(cred)
db = firestore.client()

# Stream every document of one collection whose name is already known and
# print it as "<document id> => <fields dict>".
users_ref = db.collection(u'name_of_colection')
docs = users_ref.stream()
for doc in docs:
    print(u'{} => {}'.format(doc.id, doc.to_dict()))
I have been looking for a way to get the names of the collections in the database, but I didn't find anything useful. I have also tried this:
# db.collections() returns an iterator over the top-level collections;
# materialise it so the references can be counted and indexed.
cols = db.collections()
list_col = list(cols)
# Print the count explicitly: a bare len(...) expression shows nothing
# when this runs as a script.
print(len(list_col))
I have obtained len = 6
Then I have done this for the different col in the list I have generated:
# Read every document of the sixth collection into a list of dicts and
# print the whole list once.
docs = list_col[5].stream()
data = [doc.to_dict() for doc in docs]
print(data)
This prints a list of dictionaries with keys and values, but I don't know how to get just a list with the names of the collections.
I think you have to get the id of each collection (which is the collection name you are talking about):
# The id of a top-level collection reference is the collection's name.
# Collect the id of each one instead of the reference object itself.
list_col = [col.id for col in db.collections()]
print(list_col)
I hope it helps you
Which collections you can see in Firebase depends on your access rights.
You can use
query = client.collection_group('mygroup')
or
query = client.collections()
It returns only the top level of the hierarchy, so you have to call it repeatedly to reach the lowest document level.
query = client.collection_group('mygroup')
#param {string} collectionId Identifies the collections to query over. Every collection or subcollection with this ID as the last segment of its path will be included. Cannot contain a slash. #returns {Query} The created Query.
collections()[source]
List top-level collections of the client’s database.
Returns
iterator of subcollections of the current document.
Return type
Sequence[CollectionReference]
Simple solution:
import firebase_admin
from firebase_admin import firestore
# Initialise the default Firebase app for the given project and open a
# Firestore client.
app_options = {'projectId': 'test-project'}
default_app = firebase_admin.initialize_app(options=app_options)
db = firestore.client()

# Gather the id (name) of every top-level collection and print the list.
collection = db.collections()
list_col = [col.id for col in collection]
print(list_col)

Mongoengine String Field in List field make unique

I use mongoengine and flask-restplus for API Server.
My model is here.
from datetime import datetime
from mongoengine import *
from config import DB_NAME
connect(DB_NAME)
class Board(Document):
    """Board article document.

    The timestamps default to the callable ``datetime.now``, so the value
    is computed when each document is created. The previous default was
    computed once, when the class body was executed, which gave every
    document created by the same process the same date.
    """

    # Auto-incrementing article number.
    no = SequenceField()
    title = StringField(required=True)
    body = StringField(required=True)
    # NOTE(review): unique=True on the inner field does not prevent the
    # same tag from being appended twice to one article's list; append
    # with the add_to_set update operator to keep tags distinct.
    tags = ListField(StringField(unique=True))
    password = StringField(required=True)
    created_at = DateTimeField(default=datetime.now)
    updated_at = DateTimeField(default=datetime.now)
I defined tags = ListField(StringField(unique=True))
because an article must not have duplicate tags.
Insert tag code is here.
tag = ~some user input here~
article = Board.objects.get(no=article_no)
article.tags.append(tag)
article.save()
But when I insert a duplicate tag, it is stored twice.
I want tags to contain only distinct values.
Why doesn't unique=True work?
Is there any solution to this issue?
Here's an explanation of why it does not work. As a workaround you could use the $addToSet operator provided by MongoDB. It is also implemented in MongoEngine (docs), and an example usage would look like this:
Board.objects(no=article_no).update_one(add_to_set__tags=tag)
where tag can be string or list of strings

dynamically appending a dict by retrieving list of hierarchy of managers of managers in python ldap3

The method get_employee gets the details of an employee along with their manager and returns the manager; I have that part working when run separately. I am now trying to write another method, get_manager, that takes that manager and retrieves all the managers above them in the hierarchy, up to the CEO of that particular employee, and appends them to the same dict. I tried writing get_manager so that it dynamically adds the keys manager, manager1, manager2, manager3, etc. to the same dict in a loop.
output of first method
import pandas as pd
import re
import ast
import requests
import json
import ldap3
from ldap3 import Server,Connection
from pprint import pprint
from ldap3.utils.conv import escape_filter_chars

host = "ldap://o.com"
user = "CN=P,OU=Services Accounts,OU=Accounts,DC=f,DC=r,DC=com"
# NOTE(review): the bind password is hard-coded; move it to configuration
# before using this against a real directory.
password = "h"
search_base = "dc=o,dc=f,dc=r,dc=com"
search_filter = '(cn=Pillai, Mamatha)'
df = pd.DataFrame([])
attrs = ['cn', 'givenName', 'manager', 'employeeType', 'title', 'department']
# Common name at which the upward walk stops.
TOP_MANAGER = 'mp'


def _manager_name(manager_dn):
    """Return the "Last, First" common name found in a manager DN, or None.

    Backslashes and the ``CN=`` marker are removed, then the first two
    comma-separated parts are taken, e.g. ``CN=Doe\\, John,OU=...`` gives
    ``Doe, John``.
    """
    if isinstance(manager_dn, (list, tuple)):
        # Multi-valued form of the attribute: use the first value.
        manager_dn = manager_dn[0] if manager_dn else None
    if not manager_dn:
        return None
    cleaned = manager_dn.replace('\\', "").replace('CN=', "")
    match = re.search(r'[^,]+,[^,]+', cleaned)
    return match.group(0) if match else None


def _search_entries(conn, ldap_filter, attributes):
    """Run one search under ``search_base`` and return its entries as dicts."""
    conn.search(search_base, ldap_filter, attributes=attributes)
    result = conn.response_to_json()
    pprint(result)
    # The response is JSON text, so parse it as JSON; ast.literal_eval
    # fails on JSON literals such as null/true/false.
    return json.loads(result).get('entries', [])


def _add_manager_chain(conn, manager, record):
    """Follow ``manager`` upwards and store each level in ``record``.

    The managers above ``manager`` are stored under ``manager1``,
    ``manager2`` and so on. The walk stops at ``TOP_MANAGER``, when an
    entry has no manager, or when a name repeats, so it cannot loop
    forever.
    """
    seen = {manager}
    level = 1
    while manager != TOP_MANAGER:
        # Escape the name so characters such as parentheses cannot break
        # or alter the search filter.
        ldap_filter = '(cn=' + escape_filter_chars(manager) + ')'
        next_manager = None
        for entry in _search_entries(conn, ldap_filter, ['manager']):
            next_manager = _manager_name(entry.get('attributes', {}).get('manager'))
            if next_manager:
                break
        if next_manager is None or next_manager in seen:
            break
        record['manager%d' % level] = next_manager
        seen.add(next_manager)
        manager = next_manager
        level += 1


server = ldap3.Server(host)
dfDICT = {}
# One connection serves the employee lookup and every manager lookup.
with ldap3.Connection(server, user, password) as conn:
    for j in _search_entries(conn, search_filter, attrs):
        attributes = j.get('attributes', {})
        dfDICT = {
            'Name': [attributes.get('cn')],
            'transit': [attributes.get('department')],
            'Status': [attributes.get('employeeType')],
            'title': [attributes.get('title')],
        }
        manager = _manager_name(attributes.get('manager'))
        if manager is not None:
            dfDICT['manager'] = manager
            _add_manager_chain(conn, manager, dfDICT)

Django haystack, how to match parts of words?

I'm using haystack 1.2.7 + whoosh 2.4.0 in Django 1.4 (Python is 2.7)
Example: Search query "sear" should match items containing "search" and "sear" and "searching" (etc).
my settings:
# Haystack settings: site configuration module, search backend, index
# location, and the spelling-suggestion switch.
HAYSTACK_SITECONF = 'verticalsoftware.search.search_sites'
HAYSTACK_SEARCH_ENGINE = 'whoosh'
# NOTE(review): absolute Windows path; presumably it must be writable by
# the Django process -- confirm.
HAYSTACK_WHOOSH_PATH = 'C:/whoosh/prodeo_index'
HAYSTACK_INCLUDE_SPELLING = True
search index:
class GalleryIndex(SearchIndex):
    """Search index definition for Gallery objects."""

    # Main document field, rendered from a template.
    # NOTE(review): as a CharField this presumably matches whole words
    # only; partial-word matching on it would need an n-gram field type.
    text = indexes.CharField(document=True, use_template=True)
    # N-gram field populated from the model's ``title`` attribute.
    content_auto = indexes.NgramField(model_attr='title')

    def index_queryset(self):
        """Used when the entire index for model is updated."""
        return Gallery.objects.filter(date_added__lte=datetime.datetime.now())
also tried with EdgeNgramField and/or RealTimeSearchIndex
custom urlCONF:
from django.conf.urls.defaults import *
from verticalsoftware.search.views import SearchWithRequest
# Route the empty path (r'^$') to an instance of the custom
# SearchWithRequest view, registered under the name 'haystack_search'.
urlpatterns = patterns('haystack.views',
url(r'^$', SearchWithRequest(), name='haystack_search'),
)
custom view:
from haystack.views import SearchView
import operator
from haystack.query import SearchQuerySet, SQ
class SearchWithRequest(SearchView):
    """Search view that ORs together one ``text`` clause per query word."""

    __name__ = 'SearchWithRequest'

    def build_form(self, form_kwargs=None):
        """Build the search form, supplying a default queryset when none is set.

        When the view has no ``searchqueryset``, the ``q`` request
        parameter is split on whitespace and each word becomes an
        ``SQ(text=word)`` clause; the clauses are combined with OR. A
        missing or blank ``q`` leaves the queryset unfiltered instead of
        raising.
        """
        # reduce is a builtin on Python 2 only; functools has it on both.
        from functools import reduce

        if form_kwargs is None:
            form_kwargs = {}
        if self.searchqueryset is None:
            # split() with no argument drops empty strings and surrounding
            # whitespace, so no per-word strip() is needed.
            words = (self.request.GET.get("q") or "").split()
            sqs = SearchQuerySet()
            if words:
                sqs = sqs.filter(reduce(operator.or_, [SQ(text=word) for word in words]))
            form_kwargs['searchqueryset'] = sqs
        return super(SearchWithRequest, self).build_form(form_kwargs)
for sqs I've tried everything imaginable, using filter and autocomplete as seen in the docs and every relevant forum post I could find; using __startswith and __contains in combination with my content_auto or text field didn't help at all (the latter would not match anything at all; while the former only matched 1 character or the complete string)
the variant pasted above at least has the benefit of returning results for strings with spaces (each word still has to fully match the corresponding database entry, ergo the need for this post)
any help will be IMMENSELY appreciated
Late to the party, but I suggest changing your main document field (text) to an EdgeNgramField or NgramField; otherwise the search index cannot match word fragments, and only complete-word matching is possible with a CharField.
Also, playing in the Django shell is sometimes useful when debugging Haystack:
./manage.py shell
from haystack.query import SearchQuerySet
s = SearchQuerySet()
s.auto_query('sear')
s.auto_query('sear').count()
...

Resources