I would like to search documents using a geo_shape filter (where the GeoJSON is a nested sub-document) and retrieve only the one sub-document that matched the geographical filter, not the whole document. I think the docs (specifically the nested object type and nested query/filter docs) state this is possible using join:false. For some reason I can't get it to work, though, and I'm convinced it's user error or a lack of understanding.
I'm on ES 0.90.5; below is a worked example.
Can someone point me in the right direction please?
Thanks
Clear the deck and create a new index
curl -XDELETE http://es_server.com:9200/test
{"ok":true,"acknowledged":true}
curl -XPUT http://es_server.com:9200/test
{"ok":true,"acknowledged":true}
Set a new mapping for the testtype
curl -XPUT http://es_server.com:9200/test/testtype/_mapping -d '{
"testtype": {
"properties": {
"entities": {
"type": "nested",
"properties": {
"geometry": {
"tree": "quadtree",
"type": "geo_shape",
"precision": "10m"
}
}
}
}
}
}'
{"ok":true,"acknowledged":true}
Index a new document
curl -XPUT http://es_server.com:9200/test/testtype/doc1 -d '{
"id": "doc1",
"entities": [
{
"geometry": {
"type": "Point",
"coordinates": [
0.0,
0.0
]
}
},
{
"geometry": {
"type": "Point",
"coordinates": [
180.0,
90.0
]
}
}
]
}'
Query WITH join:false
curl -XGET http://es_server.com:9200/test/testtype/_search -d '{
"query": {
"filtered": {
"filter": {
"nested": {
"path": "entities",
"join": false,
"filter": {
"geo_shape": {
"entities.geometry": {
"shape": {
"type": "envelope",
"coordinates": [
[
-10.0,
10.0
],
[
10.0,
-10.0
]
]
}
}
}
}
}
},
"query": {
"match_all": {}
}
}
}
}'
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 0,
"max_score": null,
"hits": []
}
}
Query WITHOUT join:false
curl -XGET http://es_server.com:9200/test/testtype/_search -d '{
"query": {
"filtered": {
"filter": {
"nested": {
"path": "entities",
"filter": {
"geo_shape": {
"entities.geometry": {
"shape": {
"type": "envelope",
"coordinates": [
[
-10.0,
10.0
],
[
10.0,
-10.0
]
]
}
}
}
}
}
},
"query": {
"match_all": {}
}
}
}
}'
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1.0,
"hits": [
{
"_index": "test",
"_type": "testtype",
"_id": "doc1",
"_score": 1.0,
"_source": {
"id": "doc1",
"entities": [
{
"geometry": {
"type": "Point",
"coordinates": [
0.0,
0.0
]
}
},
{
"geometry": {
"type": "Point",
"coordinates": [
180.0,
90.0
]
}
}
]
}
}
]
}
}
I want to aggregate on the brand field, but it gives me two results instead of one. From this text:
{name : "Brand 1"}
the brand_aggs gives me 2 results, brand and 1. But why? I need only Brand 1. It separates the words brand and 1 from "Brand 1" and gives me 2 results in the aggregation.
Here are my mappings for the fields I want to aggregate on:
mapping = {
"mappings": {
"product": {
"properties": {
"categories": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"fielddata": True
},
"brand": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"fielddata": True
}
}
}
}
}
My POST request:
{
"query" : {
"bool": {
"must": [
{"match": { "categories": "AV8KW5Wi31qHZdVeXG4G" }}
]
}
},
"size" : 0,
"aggs" : {
"brand_aggs" : {
"terms" : { "field" : "brand" }
},
"categories_aggs" : {
"terms" : { "field" : "categories" }
}
}
}
Response from the server:
{
"took": 18,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"categories_aggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "av8kw5wi31qhzdvexg4g",
"doc_count": 1
},
{
"key": "av8kw61c31qhzdvexg4h",
"doc_count": 1
},
{
"key": "av8kxtch31qhzdvexg4a",
"doc_count": 1
}
]
},
"brand_aggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1", <==== I dont need this , why is give me that ??
"doc_count": 1
},
{
"key": "brand",
"doc_count": 1
}
]
}
}
}
Your mapping has the fields property, which is used when you want to have multiple analyzers for the same field. In your case the valid name of your keyword field is brand.keyword. When you aggregate on just brand, it uses the default mapping defined for the string, i.e. the analyzed text field.
So your query should be:
{
"query" : {
"bool": {
"must": [
{"match": { "categories": "AV8KW5Wi31qHZdVeXG4G" }}
]
}
},
"size" : 0,
"aggs" : {
"brand_aggs" : {
"terms" : { "field" : "brand.keyword" }
},
"categories_aggs" : {
"terms" : { "field" : "categories.keyword" }
}
}
}
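As a side note, you can see why aggregating on plain brand produces two buckets by running the value through the standard analyzer, the default for analyzed text fields (your_index is a placeholder here):
GET your_index/_analyze
{
  "analyzer": "standard",
  "text": "Brand 1"
}
This returns the tokens brand and 1, which is exactly what the terms aggregation on the analyzed field sees.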
The fields property is useful when you want, for example, to search the same field with multiple analyzers:
"full_name": {
"type": "text",
"analyzer": "standard",
"boost": 1,
"fields": {
"autocomplete": {
"type": "text",
"analyzer": "ngram_analyzer"
},
"standard":{
"type": "text",
"analyzer": "standard"
}
}
},
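With a mapping like that, each sub-field can be queried on its own. For example (using the hypothetical full_name field above), an autocomplete-style search would target the ngram-analyzed sub-field:
{
  "query": {
    "match": { "full_name.autocomplete": "joh" }
  }
}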
You need to map your string as a not_analyzed string. For that, run the query below; it keeps brand analyzed for full-text search and adds a raw sub-field that is not analyzed:
PUT your_index/_mapping/your_type
{
"your_type": {
"properties": {
"brand": {
"type": "string",
"index": "analyzed",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
Don't forget to replace your_type and your_index with your own type and index names.
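Once the mapping is updated and your documents are re-indexed (changing the mapping does not re-analyze documents that are already indexed), point the aggregation at the raw sub-field, along these lines:
{
  "size": 0,
  "aggs": {
    "brand_aggs": {
      "terms": { "field": "brand.raw" }
    }
  }
}
That should give a single Brand 1 bucket instead of the separate brand and 1 buckets.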
I have created an index with an ngram analyzer set on all fields in the index and a custom _all field. After indexing a few documents, I am trying to query against the index to get a suggestion-like feature.
The query does return results, but they are not highlighted.
Analyzer Settings:
"analysis": {
"analyzer": {
"my_edgegram_analyzer": {
"filter": [
"lowercase"
],
"tokenizer": "my_edge_tokenizer"
}
},
"tokenizer": {
"my_edge_tokenizer": {
"token_chars": [
"letter",
"digit",
"punctuation",
"symbol"
],
"min_gram": "3",
"type": "ngram",
"max_gram": "26"
}
}
}
Mapping:
{
"st1": {
"mappings": {
"a": {
"_all": {
"enabled": false
},
"dynamic_templates": [
{
"catch_all": {
"match": "imp*",
"match_mapping_type": "string",
"mapping": {
"analyzer": "my_edgegram_analyzer",
"copy_to": "catch_all",
"norms": false,
"type": "text"
}
}
}
],
"properties": {
"catch_all": {
"type": "text",
"store": true,
"analyzer": "my_edgegram_analyzer"
},
"imp_server_id": {
"type": "text",
"norms": false,
"copy_to": [
"catch_all"
],
"analyzer": "my_edgegram_analyzer"
},
"imp_server_name": {
"type": "text",
"norms": false,
"copy_to": [
"catch_all"
],
"analyzer": "my_edgegram_analyzer"
}
}
},
"b": {
"_all": {
"enabled": false
},
"dynamic_templates": [
{
"catch_all": {
"match": "imp*",
"match_mapping_type": "string",
"mapping": {
"analyzer": "my_edgegram_analyzer",
"copy_to": "catch_all",
"norms": false,
"type": "text"
}
}
}
],
"properties": {
"catch_all": {
"type": "text",
"store": true,
"analyzer": "my_edgegram_analyzer"
},
"imp_server_id": {
"type": "text",
"norms": false,
"copy_to": [
"catch_all"
],
"analyzer": "my_edgegram_analyzer"
},
"imp_server_name": {
"type": "text",
"norms": false,
"copy_to": [
"catch_all"
],
"analyzer": "my_edgegram_analyzer"
}
}
}
}
}
}
Documents:
http://localhost:9200/st1/b/1
{"imp_server_name":"abc1-4-jam9.my.test.com","imp_server_id":"vrock2-us"}
http://localhost:9200/st1/a/1
{"imp_server_name":"abc2-5-ajm9.my.test.com","imp_server_id":"vrock2-us"}
Query:
{
"query": {
"match": {
"catch_all": {
"query":"test",
"analyzer": "keyword"
}
}
},
"highlight": {
"pre_tags": ["<b>"],
"post_tags": ["</b>"],
"fields": {
"*": {}
},
"require_field_match": false
}
}
Response:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0.16292635,
"hits": [
{
"_index": "st1",
"_type": "a",
"_id": "1",
"_score": 0.16292635,
"_source": {
"imp_server_name": "abc2-5-ajm9.my.test.com",
"imp_server_id": "vrock2-us"
},
"highlight": {
"imp_server_name": [
"abc2-5-ajm9.my.test.com"
],
"catch_all": [
"abc2-5-ajm9.my.test.com"
]
}
},
{
"_index": "st1",
"_type": "b",
"_id": "1",
"_score": 0.16292635,
"_source": {
"imp_server_name": "abc1-4-jam9.my.test.com",
"imp_server_id": "vrock2-us"
},
"highlight": {
"imp_server_name": [
"abc1-4-jam9.my.test.com"
],
"catch_all": [
"abc1-4-jam9.my.test.com"
]
}
}
]
}
}
How can I make highlighting work in the above scenario?
Expected output:
"highlight": {
"imp_server_name": [
"abc2-5-ajm9.my.<b>test</b>.com"
],
"catch_all": [
"abc2-5-ajm9.my.<b>test</b>.com"
]
}
I was able to get the expected results by setting term_vector in the mapping:
"term_vector": "with_positions_offsets"
As shown below, there are two types in my city index: zoo and hotel. How do I find all zoos that have a hotel within a 1 km radius? Here is the mapping of my index:
GET /city/_mapping
{
"city": {
"mappings": {
"hotel": {
"properties": {
"location": {
"type": "geo_point"
},
"name": {
"type": "string"
}
}
},
"zoo": {
"properties": {
"location": {
"type": "geo_point"
},
"name": {
"type": "string"
}
}
}
}
}
}
You can do it with a geo-distance filter for the whole index (just don't specify a type).
As a quick test I created an index like this:
PUT /test_index/
{
"mappings": {
"hotel": {
"properties": {
"location": {
"type": "geo_point"
},
"name": {
"type": "string"
}
}
},
"zoo": {
"properties": {
"location": {
"type": "geo_point"
},
"name": {
"type": "string"
}
}
}
}
}
Added a couple of documents
POST /test_index/_bulk
{"index":{"_type":"hotel","_id":1}}
{"name":"hotel1","location":{"lat" : 40.001, "lon" : -70.001}}
{"index":{"_type":"zoo","_id":1}}
{"name":"zoo1","location":{"lat" : 40.002, "lon" : -70.002}}
And then I can search like this. This query returns the one document:
POST /test_index/_search
{
"query": {
"filtered": {
"filter": {
"geo_distance": {
"distance": 200,
"distance_unit": "km",
"location": {
"lat": 40,
"lon": -70
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "hotel",
"_id": "1",
"_score": 1,
"_source": {
"name": "hotel1",
"location": {
"lat": 40.001,
"lon": -70.001
}
}
}
]
}
}
And this query returns both:
POST /test_index/_search
{
"query": {
"filtered": {
"filter": {
"geo_distance": {
"distance": 300,
"distance_unit": "km",
"location": {
"lat": 40,
"lon": -70
}
}
}
}
}
}
...
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "hotel",
"_id": "1",
"_score": 1,
"_source": {
"name": "hotel1",
"location": {
"lat": 40.001,
"lon": -70.001
}
}
},
{
"_index": "test_index",
"_type": "zoo",
"_id": "1",
"_score": 1,
"_source": {
"name": "zoo1",
"location": {
"lat": 40.002,
"lon": -70.002
}
}
}
]
}
}
Here's the code I used to test it:
http://sense.qbox.io/gist/948d23a5327cf5f22dd368146f37d09e30765fee
I have a problem with a search that I just can't figure out. My docs have the following form:
{
"timestamp":"2015-03-17T15:05:04.563Z",
"session_id":"1",
"user_id":"jan"
}
Let's say the first timestamp of a session id is the "Login" and the last timestamp is the "Logout". I want to have all "login" and "logout" docs for all sessions (if possible sorted by user_id). I managed to get the right timestamps with aggregations:
{
"aggs" : {
"group_by_uid" : {
"terms" : {
"field" : "user_id"
},
"aggs" : {
"group_by_sid" : {
"terms" : {
"field" : "session_id"
},
"aggs" : {
"max_date" : {
"max": { "field" : "timestamp" }
},
"min_date" : {
"min": { "field" : "timestamp" }
}
}
}
}
}
}
}
But how do I get the corresponding docs? I also don't mind if I have to do 2 searches (one for the logins and one for the logouts). I tried some top_hits aggregations and sorting, but I always get parse errors :/
I hope someone can give me a hint :)
Best regards,
Jan
Here's a solution in a single search based on the approach proposed by Sloan Ahrens. The advantage is that the start and end session entries are in the same bucket.
{
"aggs": {
"group_by_uid": {
"terms": {
"field": "user_id"
},
"aggs": {
"group_by_sid": {
"terms": {
"field": "session_id"
},
"aggs": {
"session_start": {
"top_hits": {
"size": 1,
"sort": [ { "timestamp": { "order": "asc" } } ]
}
},
"session_end": {
"top_hits": {
"size": 1,
"sort": [ { "timestamp": { "order": "desc" } } ]
}
}
}
}
}
}
}
}
Cheers,
Jan
You're already close. How about this: use two searches, each aggregating the way you did, but then also get the first top_hits result, sorting on "timestamp".
I just set up a basic index and added some data that looks like what you posted:
PUT /test_index
{
"settings": {
"number_of_shards": 1
}
}
POST /test_index/_bulk
{"index":{"_index":"test_index","_type":"doc","_id":1}}
{"timestamp":"2015-03-17T15:05:04.563Z","session_id":"1","user_id":"jan"}
{"index":{"_index":"test_index","_type":"doc","_id":2}}
{"timestamp":"2015-03-17T15:10:04.563Z","session_id":"1","user_id":"jan"}
{"index":{"_index":"test_index","_type":"doc","_id":3}}
{"timestamp":"2015-03-17T15:15:04.563Z","session_id":"1","user_id":"jan"}
{"index":{"_index":"test_index","_type":"doc","_id":4}}
{"timestamp":"2015-03-17T18:05:04.563Z","session_id":"1","user_id":"bob"}
{"index":{"_index":"test_index","_type":"doc","_id":5}}
{"timestamp":"2015-03-17T18:10:04.563Z","session_id":"1","user_id":"bob"}
{"index":{"_index":"test_index","_type":"doc","_id":6}}
{"timestamp":"2015-03-17T18:15:04.563Z","session_id":"1","user_id":"bob"}
Then I can get each session's start time with:
POST /test_index/_search?search_type=count
{
"aggs": {
"group_by_uid": {
"terms": {
"field": "user_id"
},
"aggs": {
"group_by_sid": {
"terms": {
"field": "session_id"
},
"aggs": {
"session_start": {
"top_hits": {
"size": 1,
"sort": [ { "timestamp": { "order": "asc" } } ]
}
}
}
}
}
}
}
}
...
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 6,
"max_score": 0,
"hits": []
},
"aggregations": {
"group_by_uid": {
"buckets": [
{
"key": "bob",
"doc_count": 3,
"group_by_sid": {
"buckets": [
{
"key": "1",
"doc_count": 3,
"session_start": {
"hits": {
"total": 3,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "4",
"_score": null,
"_source": {
"timestamp": "2015-03-17T18:05:04.563Z",
"session_id": "1",
"user_id": "bob"
},
"sort": [
1426615504563
]
}
]
}
}
}
]
}
},
{
"key": "jan",
"doc_count": 3,
"group_by_sid": {
"buckets": [
{
"key": "1",
"doc_count": 3,
"session_start": {
"hits": {
"total": 3,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": null,
"_source": {
"timestamp": "2015-03-17T15:05:04.563Z",
"session_id": "1",
"user_id": "jan"
},
"sort": [
1426604704563
]
}
]
}
}
}
]
}
}
]
}
}
}
and each session's end time with:
POST /test_index/_search?search_type=count
{
"aggs": {
"group_by_uid": {
"terms": {
"field": "user_id"
},
"aggs": {
"group_by_sid": {
"terms": {
"field": "session_id"
},
"aggs": {
"session_end": {
"top_hits": {
"size": 1,
"sort": [ { "timestamp": { "order": "desc" } } ]
}
}
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 6,
"max_score": 0,
"hits": []
},
"aggregations": {
"group_by_uid": {
"buckets": [
{
"key": "bob",
"doc_count": 3,
"group_by_sid": {
"buckets": [
{
"key": "1",
"doc_count": 3,
"session_end": {
"hits": {
"total": 3,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "6",
"_score": null,
"_source": {
"timestamp": "2015-03-17T18:15:04.563Z",
"session_id": "1",
"user_id": "bob"
},
"sort": [
1426616104563
]
}
]
}
}
}
]
}
},
{
"key": "jan",
"doc_count": 3,
"group_by_sid": {
"buckets": [
{
"key": "1",
"doc_count": 3,
"session_end": {
"hits": {
"total": 3,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": null,
"_source": {
"timestamp": "2015-03-17T15:15:04.563Z",
"session_id": "1",
"user_id": "jan"
},
"sort": [
1426605304563
]
}
]
}
}
}
]
}
}
]
}
}
}
Here's the code I used:
http://sense.qbox.io/gist/05edb48b840e6a992646643913db8ef0a3ccccb3
In elasticsearch nested filter return empty result I asked about an error in my query that was returning no results, and the answer pointed out that the expression I use in the filter wasn't analyzed the way I expected.
I have a custom analyzer that does the work. How can I specify, in the query below, that the filter should use this custom analyzer?
GET /develop/_search?search_type=dfs_query_then_fetch
{
"query": {
"filtered" : {
"query": {
"bool": {
"must": [
{ "match": { "title": "post" }}
]
}
},
"filter": {
"bool": {
"must": [
{"term": {
"featured": 0
}},
{
"nested": {
"path": "seller",
"filter": {
"bool": {
"must": [
{ "term": { "seller.firstName": "Test 3" } }
]
}
},
"_cache" : true
}}
]
}
}
}
},
"sort": [
{
"_score":{
"order": "desc"
}
},{
"created": {
"order": "desc"
}
}
],
"track_scores": true
}
Here is a setup that seems to do what you want. I used the same basic code as the last answer, but set index_analyzer and search_analyzer in the index definition as follows:
curl -XDELETE "http://localhost:9200/my_index"
curl -XPUT "http://localhost:9200/my_index" -d'
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"filter": {
"snowball": { "type": "snowball", "language": "English" },
"english_stemmer": { "type": "stemmer", "language": "english" },
"english_possessive_stemmer": { "type": "stemmer", "language": "possessive_english" },
"stopwords": { "type": "stop", "stopwords": [ "_english_" ] },
"worddelimiter": { "type": "word_delimiter" }
},
"tokenizer": {
"nGram": { "type": "nGram", "min_gram": 3, "max_gram": 20 }
},
"analyzer": {
"custom_analyzer": {
"type": "custom",
"tokenizer": "nGram",
"filter": [
"stopwords",
"asciifolding",
"lowercase",
"snowball",
"english_stemmer",
"english_possessive_stemmer",
"worddelimiter"
]
},
"custom_search_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"stopwords",
"asciifolding",
"lowercase",
"snowball",
"english_stemmer",
"english_possessive_stemmer",
"worddelimiter"
]
}
}
}
},
"mappings": {
"posts": {
"properties": {
"title": {
"type": "string",
"analyzer": "custom_analyzer",
"boost": 5
},
"seller": {
"type": "nested",
"properties": {
"firstName": {
"type": "string",
"index_analyzer": "custom_analyzer",
"search_analyzer": "custom_search_analyzer",
"boost": 3
}
}
}
}
}
}
}'
Then added the test docs
curl -XPUT "http://localhost:9200/my_index/posts/1" -d'
{"title": "post", "seller": {"firstName":"Test 1"}}'
curl -XPUT "http://localhost:9200/my_index/posts/2" -d'
{"title": "post", "seller": {"firstName":"Test 2"}}'
curl -XPUT "http://localhost:9200/my_index/posts/3" -d'
{"title": "post", "seller": {"firstName":"Test 3"}}'
And then a couple of match queries in a bool, where one is a multi-word query, seem to accomplish what you want:
curl -XPOST "http://localhost:9200/my_index/_search" -d'
{
"query": {
"bool": {
"must": [
{
"match": {
"title": "post"
}
},
{
"nested": {
"path": "seller",
"query": {
"match": {
"seller.firstName": {
"query": "Test 3",
"operator": "and"
}
}
}
}
}
]
}
}
}'
...
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 6.8380365,
"hits": [
{
"_index": "my_index",
"_type": "posts",
"_id": "3",
"_score": 6.8380365,
"_source": {
"title": "post",
"seller": {
"firstName": "Test 3"
}
}
}
]
}
}
Here is the code I used:
http://sense.qbox.io/gist/8cd954aa60be8c44f64e4282e15e6b565c945ecb
Does that solve your problem?