Elasticsearch query stops working with a large amount of data

The problem: I have two indexes with identical settings and mappings.
The first index contains only one document.
The second index contains the same document plus 16M others.
When I run the query against the first index it returns the document, but the same query against the second returns nothing.
Index settings:
{
  "tasks_test": {
    "settings": {
      "index": {
        "analysis": {
          "analyzer": {
            "tag_analyzer": {
              "filter": [
                "lowercase",
                "tag_filter"
              ],
              "tokenizer": "whitespace",
              "type": "custom"
            }
          },
          "filter": {
            "tag_filter": {
              "type": "word_delimiter",
              "type_table": "# => ALPHA"
            }
          }
        },
        "creation_date": "1444127141035",
        "number_of_replicas": "2",
        "number_of_shards": "5",
        "uuid": "wTe6WVtLRTq0XwmaLb7BLg",
        "version": {
          "created": "1050199"
        }
      }
    }
  }
}
Mappings:
{
  "tasks_test": {
    "mappings": {
      "Task": {
        "dynamic": "false",
        "properties": {
          "<missing field name>": {
            "format": "dateOptionalTime",
            "include_in_all": false,
            "type": "date"
          },
          "is_private": {
            "type": "boolean"
          },
          "last_timestamp": {
            "type": "integer"
          },
          "name": {
            "analyzer": "tag_analyzer",
            "type": "string"
          },
          "project_id": {
            "include_in_all": false,
            "type": "integer"
          },
          "user_id": {
            "include_in_all": false,
            "type": "integer"
          }
        }
      }
    }
  }
}
The document:
{
  "_index": "tasks_test",
  "_type": "Task",
  "_id": "1",
  "_source": {
    "is_private": false,
    "name": "135548- test with number",
    "project_id": 2,
    "user_id": 1
  }
}
The query:
{
  "query": {
    "filtered": {
      "query": {
        "bool": {
          "must": [
            {
              "match": {
                "_all": {
                  "query": "135548",
                  "type": "phrase_prefix"
                }
              }
            }
          ]
        }
      },
      "filter": {
        "bool": {
          "must": [
            {
              "term": {
                "is_private": false
              }
            },
            {
              "terms": {
                "project_id": [
                  2
                ]
              }
            },
            {
              "terms": {
                "user_id": [
                  1
                ]
              }
            }
          ]
        }
      }
    }
  }
}
Also, some findings:
- if I replace _all with name, everything works
- if I replace match_phrase_prefix with match_phrase, it also works
ES version: 1.5.1
So, the question is: how do I make the query work on the second index without the hacks mentioned above?
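The question doesn't state it, but a likely culprit is the expansion limit of phrase_prefix: in Elasticsearch 1.x the final term of the phrase ("135548") is expanded to at most max_expansions matching terms (default 50). On a 16M-document index there are probably far more than 50 terms starting with 135548, so the expansion that would actually match this document can be cut off; the one-document index never hits the limit, and match_phrase does no expansion at all, which would explain both findings above. A sketch of the same query with the limit raised (the value 10000 is an arbitrary example):
{
  "query": {
    "filtered": {
      "query": {
        "bool": {
          "must": [
            {
              "match": {
                "_all": {
                  "query": "135548",
                  "type": "phrase_prefix",
                  "max_expansions": 10000
                }
              }
            }
          ]
        }
      },
      "filter": { ... }
    }
  }
}
Here "filter": { ... } stands for the unchanged bool filter from the original query.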

Related

Adding index to mappings in MongoDB

Hey, I have a problem with my MongoDB search. I need to add one more field to my search index for better results, but since it's not in the mappings, whenever I perform my search as follows it returns nothing.
embeddedDocument: {
  path: 'produces',
  operator: {
    compound: {
      must: [
        {
          autocomplete: {
            path: 'produces.name',
            query: val,
            fuzzy: {
              maxEdits: 2,
              prefixLength: 3
            }
          }
        },
        {
          equals: { path: 'produces.deleted', value: false }
        }
      ]
    }
  }
}
And here is my index definition:
{
  "mappings": {
    "dynamic": false,
    "fields": {
      "companyName": [
        {
          "analyzer": "lucene.standard",
          "type": "string"
        },
        {
          "foldDiacritics": true,
          "maxGrams": 6,
          "minGrams": 2,
          "tokenization": "edgeGram",
          "type": "autocomplete"
        }
      ],
      "produces": {
        "type": "embeddedDocuments",
        "fields": {
          "name": [
            {
              "analyzer": "lucene.standard",
              "type": "string"
            },
            {
              "foldDiacritics": true,
              "maxGrams": 6,
              "minGrams": 2,
              "tokenization": "edgeGram",
              "type": "autocomplete"
            }
          ]
        }
      },
      "status": {
        "type": "string"
      }
    }
  }
}
So I need to update my index definition as follows:
{
  "mappings": {
    "dynamic": false,
    "fields": {
      "companyName": [
        {
          "analyzer": "lucene.standard",
          "type": "string"
        },
        {
          "foldDiacritics": true,
          "maxGrams": 6,
          "minGrams": 2,
          "tokenization": "edgeGram",
          "type": "autocomplete"
        }
      ],
      "produces": {
        "type": "embeddedDocuments",
        "fields": {
          "deleted": {
            "type": "boolean"
          },
          "name": [
            {
              "analyzer": "lucene.standard",
              "type": "string"
            },
            {
              "foldDiacritics": true,
              "maxGrams": 6,
              "minGrams": 2,
              "tokenization": "edgeGram",
              "type": "autocomplete"
            }
          ]
        }
      },
      "status": {
        "type": "string"
      }
    }
  }
}
But my problem is that I don't know where to apply this updated index definition on the MongoDB side so that I can filter on the deleted: false criterion.
I tried adding a regular index to the produces collection in MongoDB, but it doesn't work and I still get an empty result.
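For what it's worth, Atlas Search index definitions are not regular MongoDB indexes, so createIndex on the produces collection won't help; the definition is edited in the Atlas UI (Search tab → Edit Index Definition), via the Atlas Administration API, or, in newer versions, with updateSearchIndex in mongosh. A sketch in mongosh, assuming MongoDB 7.0+/Atlas and that the collection is called companies and the search index is named default (both names are assumptions, not from the question):
// Hypothetical names: adjust "companies" and "default" to your setup.
// updateSearchIndex replaces the entire definition, so pass the full
// updated mapping (the second definition above), not just the new field.
db.companies.updateSearchIndex("default", {
  mappings: {
    dynamic: false,
    fields: {
      // ... the full updated definition, including produces.deleted ...
    }
  }
})
After the index finishes rebuilding, the equals clause on produces.deleted should start returning matches.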

How to do elasticsearch aggregation together with sort and find duplicate values

I want to find duplicate values, and when there are duplicates I want to sort by last update and keep only the newest one. How do I write this aggregation? I've tried the aggregation below.
I've tried adding sort to sources, but it still doesn't work. I've tried several ways and it still fails: sometimes it returns one result but only old data, and sometimes the order is correct (newest first) but two documents appear.
{
  "size": 0,
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "BILLING_TYPE_CD": "Service Bundle"
          }
        },
        {
          "match": {
            "ID": "xxxx"
          }
        },
        {
          "exists": {
            "field": "LI_MILESTONE"
          }
        },
        {
          "exists": {
            "field": "LI_SID"
          }
        },
        {
          "query_string": {
            "default_field": "LI_SID",
            "query": "*xxxx*"
          }
        }
      ],
      "must_not": {
        "bool": {
          "must": [
            {
              "query_string": {
                "default_field": "LI_PRODUCT_NAME",
                "query": "*Network*"
              }
            },
            {
              "terms": {
                "LI_MILESTONE.keyword": [
                  "Abandoned",
                  "Cancelled"
                ]
              }
            },
            {
              "terms": {
                "ORDER_STATUS.keyword": [
                  "Abandoned",
                  "Cancelled",
                  "Drop In Progress"
                ]
              }
            },
            {
              "term": {
                "STATUS.keyword": ""
              }
            }
          ]
        }
      }
    }
  },
  "sort": [
    {
      "TGL_CREATED": {
        "order": "desc"
      }
    }
  ],
  "aggs": {
    "list_products": {
      "composite": {
        "size": 50000,
        "sources": [
          {
            "LI_SID": {
              "terms": {
                "field": "LI_SID.keyword",
                "order": "desc"
              }
            }
          }
        ]
      },
      "aggs": {
        "totalService": {
          "terms": {
            "field": "LI_SID.keyword",
            "size": 50000,
            "order": {
              "_term": "asc"
            }
          }
        },
        "bucket_sort": {
          "bucket_sort": {
            "from": 0,
            "size": 10
          }
        },
        "includes_source": {
          "top_hits": {
            "size": 1,
            "_source": {
              "includes": [
                "LAST_UPDATE",
                "xxxxx",
                "xxxxx",
                "xxxxx",
                "xxx"
              ]
            }
          }
        }
      }
    },
    "term_product": {
      "terms": {
        "field": "LI_SID.keyword",
        "size": 50000
      }
    }
  }
}
Like this?
{
  "aggs": {
    "LI_SID": {
      "terms": {
        "field": "LI_SID.keyword",
        "size": 10
      },
      "aggs": {
        "hit": {
          "top_hits": {
            "size": 1,
            "sort": [
              {
                "LAST_UPDATE": "desc"
              }
            ]
          }
        }
      }
    }
  },
  "size": 0
}
You need to read the results from the aggregations section of the response, not from hits.
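Because the request sets size: 0, the top-level hits array is empty by design; the deduplicated documents live inside each bucket's top_hits result. The response to the query above should have roughly this shape (values are placeholders):
{
  "hits": { "total": 123, "hits": [] },
  "aggregations": {
    "LI_SID": {
      "buckets": [
        {
          "key": "some-li-sid",
          "doc_count": 2,
          "hit": {
            "hits": {
              "hits": [
                { "_source": { "LAST_UPDATE": "..." } }
              ]
            }
          }
        }
      ]
    }
  }
}
So the newest document per LI_SID sits at aggregations.LI_SID.buckets[i].hit.hits.hits[0]._source.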

Return documents if an array field length is greater than 0 with the Elasticsearch Node.js client

I have millions of documents in my ES index.
I want to fetch the documents where an array field has a length greater than 0.
My docs look like this:
[
  {
    "primaryKey": "9c30d9e8-af04-4cc8-afcb-0c1311988c1e",
    "language": "all",
    "industry": [
      "Accounting & auditing"
    ],
    "text": "what's the status of my incident?",
    "textId": "d0c70fc4-5e2a-4cab-a5f6-32339e6632dd",
    "extractions": [],
    "active": true,
    "status": "active",
    "createdAt": 1620208485092,
    "updatedAt": 1620208485092,
    "secondaryKey": "5db5f725-ec09-49da-9507-7bb2f94fd741"
  },
  {
    "primaryKey": "9c30d9e8-af04-4cc8-afcb-0c1311988c1e",
    "language": "all",
    "industry": [
      "Accounting"
    ],
    "text": "What is the rating of my incident",
    "textId": "4a53533f-293e-440c-aaa9-f7e5ae1436ca",
    "extractions": [
      {
        "name": "Abinas Patra",
        "role": "api-user",
        "primaryKey": "ed12851d-c18d-4c92-8cc3-1782e41bc9d0"
      },
      {
        "name": "Anil Patra",
        "role": "ui-user",
        "primaryKey": "933fad33-78b3-4779-a7bd-c62c6e02af75"
      }
    ],
    "active": true,
    "status": "active",
    "createdAt": 1620208485092,
    "updatedAt": 1620208485092,
    "secondaryKey": "5db5f725-ec09-49da-9507-7bb2f94fd741"
  }
]
I am using the Elasticsearch Node.js client.
I tried it the following way:
let dataCount = await esClient.count({
  index: "indexName",
  type: "docType",
  body: {
    query: {
      bool: {
        must: [
          {
            script: {
              script: {
                inline: "doc['extractions'].values.length > 0",
                lang: "painless"
              }
            }
          },
          {
            match: {
              primaryKey: {
                query: primaryKey,
                operator: "and"
              }
            }
          },
          {
            match: {
              language: {
                query: language,
                operator: "and"
              }
            }
          }
        ]
      }
    }
  }
});
I get a runtime parsing error every time; I tried an exists query as well.
{"error":{"root_cause":[{"type":"script_exception","reason":"runtime error","script_stack":["org.elasticsearch.search.lookup.LeafDocLookup.get(LeafDocLookup.java:65)","org.elasticsearch.search.lookup.LeafDocLookup.get(LeafDocLookup.java:27)","doc[\'extractions\'].values.length > 1"," ^---- HERE"],
I tried this as well:
must_not: [
  {
    script: {
      script: "_source.extractions.size() > 0"
    }
  }
]
Can anyone please help here?
Thanks :)
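For context on the error: doc['extractions'] fails because extractions is an object field, and doc values exist only for its leaf fields; _source is likewise not available in query-time scripts. Since Elasticsearch does not index empty arrays, a common workaround is an exists query on an inner field instead of a script. A sketch, assuming every extraction has a primaryKey:
{
  "query": {
    "bool": {
      "must": [
        { "exists": { "field": "extractions.primaryKey" } },
        { "match": { "primaryKey": { "query": "...", "operator": "and" } } },
        { "match": { "language": { "query": "...", "operator": "and" } } }
      ]
    }
  }
}
A document whose extractions array is empty has no indexed extractions.primaryKey values, so the exists clause filters it out.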

How to apply filter on geo coordinates in elasticsearch?

I am using Elasticsearch with the mongoosastic npm module. I am trying to apply a filter on geo coordinates with the following model structure:
geoLocation: {
  type: {
    type: String,
    default: 'Point'
  },
  coordinates: [Number] // order should be lat,lng
}
with the following mapping:
{
  "events": {
    "settings": {
      "analysis": {
        "filter": {
          "edgeNGram_filter": {
            "type": "edgeNGram",
            "min_gram": 1,
            "max_gram": 50,
            "side": "front"
          }
        },
        "analyzer": {
          "edge_nGram_analyzer": {
            "type": "custom",
            "tokenizer": "edge_ngram_tokenizer",
            "filter": [
              "lowercase",
              "asciifolding",
              "edgeNGram_filter"
            ]
          },
          "whitespace_analyzer": {
            "type": "custom",
            "tokenizer": "whitespace",
            "filter": [
              "lowercase",
              "asciifolding"
            ]
          }
        },
        "tokenizer": {
          "edge_ngram_tokenizer": {
            "type": "edgeNGram",
            "min_gram": "1",
            "max_gram": "50",
            "token_chars": [
              "letter",
              "digit"
            ]
          }
        }
      }
    },
    "mappings": {
      "event": {
        "_all": {
          "index_analyzer": "nGram_analyzer",
          "search_analyzer": "whitespace_analyzer"
        },
        "properties": {
          "title": {
            "type": "string",
            "index": "not_analyzed"
          },
          "geoLocation": {
            "index": "not_analyzed",
            "type": "geo_point"
          }
        }
      }
    }
  }
}
Query
{
  "query": {
    "multi_match": {
      "query": "the",
      "fields": ["title"]
    }
  },
  "filter": {
    "geo_distance": {
      "distance": "200km",
      "geoLocation.coordinates": {
        "lat": 19.007452,
        "lon": 72.831556
      }
    }
  }
}
I am unable to index the geo coordinates with the above model structure. I don't understand whether it's even possible, because in my case the coordinates are ordered lat,long, and I have read that Elasticsearch expects coordinates in long,lat order.
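On the ordering point: geo_point accepts several input formats, and only the bare array form uses GeoJSON's lon,lat order; the object form names the keys explicitly, so the ordering ambiguity disappears. An illustration using the coordinates from the query above (with a plain geo_point field rather than the Mongoose model structure):
{ "geoLocation": [72.831556, 19.007452] }

{ "geoLocation": { "lat": 19.007452, "lon": 72.831556 } }
The first (array) form must be lon first, lat second; the second (object) form is order-independent.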
Error
Error: SearchPhaseExecutionException[Failed to execute phase [query],
all shards failed; shardFailures {[CDHdgtJnTbeu8tl2mDfllg][events][0]:
SearchParseException[[events][0]: from[-1],size[-1]: Parse Failure
[Failed to parse source
curl -XGET localhost:9200/events
{
  "events": {
    "aliases": {},
    "mappings": {
      "1": {
        "properties": {
          "location": {
            "type": "double"
          },
          "text": {
            "type": "string"
          }
        }
      },
      "event": {
        "properties": {
          "city": {
            "type": "string"
          },
          "endTime": {
            "type": "date",
            "format": "dateOptionalTime"
          },
          "geo_with_lat_lon": {
            "type": "geo_point",
            "lat_lon": true
          },
          "isActive": {
            "type": "boolean"
          },
          "isRecommended": {
            "type": "boolean"
          },
          "location": {
            "type": "string"
          },
          "title": {
            "type": "string"
          }
        }
      }
    },
    "settings": {
      "index": {
        "creation_date": "1461675012489",
        "uuid": "FT-xVUdPQtyuKFm4J4Rd7g",
        "number_of_replicas": "1",
        "number_of_shards": "5",
        "events": {
          "mappings": {
            "event": {
              "_all": {
                "enabled": "false",
                "search_analyzer": "whitespace_analyzer",
                "index_analyzer": "nGram_analyzer"
              },
              "properties": {
                "geoLocation": {
                  "coordinates": {
                    "type": "geo_shape",
                    "index": "not_analyzed"
                  }
                },
                "location": {
                  "type": "string",
                  "index": "not_analyzed"
                },
                "title": {
                  "type": "string",
                  "index": "not_analyzed"
                },
                "geo_with_lat_lon": {
                  "type": "geo_point",
                  "lat_lon": "true",
                  "index": "not_analyzed"
                }
              }
            }
          },
          "settings": {
            "analysis": {
              "analyzer": {
                "edge_nGram_analyzer": {
                  "type": "custom",
                  "filter": [
                    "lowercase",
                    "asciifolding",
                    "edgeNGram_filter"
                  ],
                  "tokenizer": "edge_ngram_tokenizer"
                },
                "whitespace_analyzer": {
                  "type": "custom",
                  "filter": [
                    "lowercase",
                    "asciifolding"
                  ],
                  "tokenizer": "whitespace"
                }
              },
              "filter": {
                "edgeNGram_filter": {
                  "max_gram": "50",
                  "type": "edgeNGram",
                  "min_gram": "1",
                  "side": "front"
                }
              },
              "tokenizer": {
                "edge_ngram_tokenizer": {
                  "max_gram": "50",
                  "type": "edgeNGram",
                  "min_gram": "1",
                  "token_chars": [
                    "letter",
                    "digit"
                  ]
                }
              }
            }
          }
        },
        "version": {
          "created": "1070099"
        }
      }
    },
    "warmers": {}
  }
}
I found a solution to my question.
Mapping
PUT /geo_test
{
  "mappings": {
    "type_test": {
      "properties": {
        "name": {
          "type": "string"
        },
        "geoLocation": {
          "type": "nested",
          "properties": {
            "coordinates": {
              "type": "geo_point",
              "lat_lon": true
            }
          }
        }
      }
    }
  }
}
Query
POST /geo_test/type_test/_search
{
  "query": {
    "filtered": {
      "filter": {
        "nested": {
          "path": "geoLocation",
          "query": {
            "filtered": {
              "filter": {
                "geo_distance": {
                  "distance": 5,
                  "distance_unit": "km",
                  "geoLocation.coordinates": {
                    "lat": 41.12,
                    "lon": -71.34
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}
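For completeness, a document indexed against that mapping might look like this (a sketch; the name value is invented, and the object form of the geo_point avoids the lat/lon ordering problem):
PUT /geo_test/type_test/1
{
  "name": "some event",
  "geoLocation": {
    "coordinates": {
      "lat": 41.12,
      "lon": -71.34
    }
  }
}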

Elasticsearch use custom analyzer on filter

I asked in elasticsearch nested filter return empty result about an error in my query that was returning no results, and the answer pointed out that the expression I use in the filter isn't analyzed the way I expected.
I have a custom analyzer that does the work. How can I tell the filter in the following query to use this custom analyzer?
GET /develop/_search?search_type=dfs_query_then_fetch
{
  "query": {
    "filtered": {
      "query": {
        "bool": {
          "must": [
            { "match": { "title": "post" } }
          ]
        }
      },
      "filter": {
        "bool": {
          "must": [
            {
              "term": {
                "featured": 0
              }
            },
            {
              "nested": {
                "path": "seller",
                "filter": {
                  "bool": {
                    "must": [
                      { "term": { "seller.firstName": "Test 3" } }
                    ]
                  }
                },
                "_cache": true
              }
            }
          ]
        }
      }
    }
  },
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    },
    {
      "created": {
        "order": "desc"
      }
    }
  ],
  "track_scores": true
}
Here is a setup that seems to do what you want. I used the same basic code as the last answer, but used index_analyzer and search_analyzer in the index definition as follows:
curl -XDELETE "http://localhost:9200/my_index"
curl -XPUT "http://localhost:9200/my_index" -d'
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "analysis": {
      "filter": {
        "snowball": { "type": "snowball", "language": "English" },
        "english_stemmer": { "type": "stemmer", "language": "english" },
        "english_possessive_stemmer": { "type": "stemmer", "language": "possessive_english" },
        "stopwords": { "type": "stop", "stopwords": [ "_english_" ] },
        "worddelimiter": { "type": "word_delimiter" }
      },
      "tokenizer": {
        "nGram": { "type": "nGram", "min_gram": 3, "max_gram": 20 }
      },
      "analyzer": {
        "custom_analyzer": {
          "type": "custom",
          "tokenizer": "nGram",
          "filter": [
            "stopwords",
            "asciifolding",
            "lowercase",
            "snowball",
            "english_stemmer",
            "english_possessive_stemmer",
            "worddelimiter"
          ]
        },
        "custom_search_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "stopwords",
            "asciifolding",
            "lowercase",
            "snowball",
            "english_stemmer",
            "english_possessive_stemmer",
            "worddelimiter"
          ]
        }
      }
    }
  },
  "mappings": {
    "posts": {
      "properties": {
        "title": {
          "type": "string",
          "analyzer": "custom_analyzer",
          "boost": 5
        },
        "seller": {
          "type": "nested",
          "properties": {
            "firstName": {
              "type": "string",
              "index_analyzer": "custom_analyzer",
              "search_analyzer": "custom_search_analyzer",
              "boost": 3
            }
          }
        }
      }
    }
  }
}'
Then I added the test docs:
curl -XPUT "http://localhost:9200/my_index/posts/1" -d'
{"title": "post", "seller": {"firstName":"Test 1"}}'
curl -XPUT "http://localhost:9200/my_index/posts/2" -d'
{"title": "post", "seller": {"firstName":"Test 2"}}'
curl -XPUT "http://localhost:9200/my_index/posts/3" -d'
{"title": "post", "seller": {"firstName":"Test 3"}}'
And then a couple of match queries in a bool, where one is a multiword query, seem to accomplish what you want:
curl -XPOST "http://localhost:9200/my_index/_search" -d'
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "title": "post"
          }
        },
        {
          "nested": {
            "path": "seller",
            "query": {
              "match": {
                "seller.firstName": {
                  "query": "Test 3",
                  "operator": "and"
                }
              }
            }
          }
        }
      ]
    }
  }
}'
...
{
  "took": 5,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 6.8380365,
    "hits": [
      {
        "_index": "my_index",
        "_type": "posts",
        "_id": "3",
        "_score": 6.8380365,
        "_source": {
          "title": "post",
          "seller": {
            "firstName": "Test 3"
          }
        }
      }
    ]
  }
}
Here is the code I used:
http://sense.qbox.io/gist/8cd954aa60be8c44f64e4282e15e6b565c945ecb
Does that solve your problem?
