Elasticsearch aggregate by geo location - search

I want to group data by their location with shapes and bounding boxes. For example group the results by hemisphere:
GET cdc/_search
{
"from": 0,
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"group_by_geo": {
"geo_bounding_box": {
"field": "location",
"boxes": [
{
"top_left": {
"lat": 90,
"lon": -180
},
"bottom_right": {
"lat": 0,
"lon": 0
}
},
{
"top_left": {
"lat": 90,
"lon": 0
},
"bottom_right": {
"lat": 0,
"lon": 180
}
},
{
"top_left": {
"lat": 0,
"lon": -180
},
"bottom_right": {
"lat": -90,
"lon": 0
}
},
{
"top_left": {
"lat": 0,
"lon": 0
},
"bottom_right": {
"lat": -90,
"lon": 180
}
}
]
},
"aggs": {
"over_time": {
"date_histogram": {
"field": "date",
"interval": "day",
"format": "yyyy-MM-dd"
},
"aggs": {
"AvgTemp": {
"avg": {
"field": "hits"
}
}
}
}
}
}
}
}
So the over_timepart works fine isolated but this query gives me "reason": "Could not find aggregator type [geo_bounding_box] in [group_by_geo]"
My syntax is inspired by range aggregations but apparently this is not working in this case.
Is this possible or do I have to make separate queries filtering by the 4 boxes?

There's no geo_bounding_box aggregation, (there's a geo_bounding_box filter, though).
You can achieve what you need using a filters aggregation containing one geo_bounding_box filter for each bounding box. Basically something like this:
{
"from": 0,
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"group_by_geo": {
"filters": {
"filters": {
"box1": {
"geo_bounding_box": {
"location": {
"top_left": {
"lat": 90,
"lon": -180
},
"bottom_right": {
"lat": 0,
"lon": 0
}
}
}
},
"box2": {
"geo_bounding_box": {
"location": {
"top_left": {
"lat": 90,
"lon": 0
},
"bottom_right": {
"lat": 0,
"lon": 180
}
}
}
},
"box3": {
"geo_bounding_box": {
"location": {
"top_left": {
"lat": 0,
"lon": -180
},
"bottom_right": {
"lat": -90,
"lon": 0
}
}
}
},
"box4": {
"geo_bounding_box": {
"location": {
"top_left": {
"lat": 0,
"lon": 0
},
"bottom_right": {
"lat": -90,
"lon": 180
}
}
}
}
}
},
"aggs": {
"over_time": {
"date_histogram": {
"field": "date",
"interval": "day",
"format": "yyyy-MM-dd"
},
"aggs": {
"AvgTemp": {
"avg": {
"field": "hits"
}
}
}
}
}
}
}
}

Related

Opensearch transform splitting array values to new events

I'm transforming index that contains following event.
But the values inside of array are splitting into the new events.
e.g:
"serviceIdentifiers": "Redis"
"serviceIdentifiers":"Event_Detector Servicc"
etc.
{
"_index": "collated_txn_health_2022.05",
"_type": "_doc",
"_id": "LAUpboIBh6CUatILrsN3",
"_score": 1,
"_source": {
"timeInGMT": 0,
"kpiId": 0,
"compInstanceIdentifier": "d0352b7d-0484-4714-bbc8-eb67cbb7be70",
"agentIdentifier": "ComponentAgent-171",
"kpiIdentifier": "PACKETS_DROPPED",
"categoryIdentifier": "Network Utilization",
"applicationIdentifier": null,
"serviceIdentifiers": [
"Supervisor_Controller Service",
"Event_Detector Service",
"UI_Service",
"Redis",
"CC_Service"
],
"clusterIdentifiers": [
"a5c57ef5-4018-41b8-b727-27c8f8376c0e"
],
"collectionInterval": 60,
"value": "0.0",
"kpiType": "Core",
"groupAttribute": "ALL",
"groupIdentifier": null,
"watcherValue": null,
"errorCode": null,
"clusterOperation": null,
"aggLevelInMins": 1,
"error": false,
"kpiGroup": false,
"discovery": false,
"maintenanceExcluded": false,
"#timestamp": "2022-05-01T01:32:00.000Z"
}
Following is the transform job configuration.
curl -u admin:admin -XPUT "http://XXX.XXX.XX.XXX9201/_plugins/_transform/my-array-job-2" -H 'Content-type: application/json' -d'
{
"transform": {
"schedule": {
"interval": {
"start_time": 1659705000000,
"period": 1,
"unit": "Minutes"
}
},
"metadata_id": null,
"updated_at": 1659456180000,
"enabled": true,
"enabled_at": 1659457620000,
"description": "",
"source_index": "collated_txn_health_2022.05",
"data_selection_query": {
"match_all": {
"boost": 1
}
},
"target_index": "transform_collated_txn_health_2022.05",
"page_size": 1000,
"groups": [
{
"date_histogram": {
"fixed_interval": "1m",
"source_field": "#timestamp",
"target_field": "#timestamp",
"timezone": "Asia/Calcutta"
}
},
{
"terms": {
"source_field": "clusterIdentifiers",
"target_field": "clusterIdentifiers"
}
},
{
"terms": {
"source_field": "serviceIdentifiers",
"target_field": "serviceIdentifiers"
}
},
{
"terms": {
"source_field": "compInstanceIdentifier",
"target_field": "compInstanceIdentifier"
}
},
{
"terms": {
"source_field": "agentIdentifier",
"target_field": "agentIdentifier"
}
}
],
"aggregations": {
"count_#timestamp": {
"value_count": {
"field": "#timestamp"
}
}
}
}
}'
Following are the events from the transform index.
{
"_index": "transform_heal_collated_txn_health_2022.05",
"_type": "_doc",
"_id": "ybK0McQ9NZrt9xdo9iWKbA",
"_score": 1,
"_source": {
"transform._id": "my-array-job-2",
"transform._doc_count": 2,
"#timestamp": 1651365120000,
"clusterIdentifiers": "a5c57ef5-4018-41b8-b727-27c8f8376c0e",
"serviceIdentifiers": "Redis",
"compInstanceIdentifier": "a5c57ef5-4018-41b8-b727-27c8f8376c0e",
"agentIdentifier": "ComponentAgent-170",
"count_#timestamp": 2
}
},
{
"_index": "transform_heal_collated_txn_health_2022.05",
"_type": "_doc",
"_id": "Wf-4KwnFaYuw9bL-V-9WEQ",
"_score": 1,
"_source": {
"transform._id": "my-array-job-2",
"transform._doc_count": 2,
"#timestamp": 1651365120000,
"clusterIdentifiers": "a5c57ef5-4018-41b8-b727-27c8f8376c0e",
"serviceIdentifiers": "Redis_Server Service",
"compInstanceIdentifier": "a5c57ef5-4018-41b8-b727-27c8f8376c0e",
"agentIdentifier": "ComponentAgent-170",
"count_#timestamp": 2
}
It would be a great help if somebody suggest me with solution for array fields.
Have solved the issue with following painless script. Which help to transform array fields in opensearch.
PUT _plugins/_transform/my-array-job-2
{
"transform": {
"schedule": {
"interval": {
"start_time": 1659705000000,
"period": 1,
"unit": "Minutes"
}
},
"metadata_id": null,
"updated_at": 1659456180000,
"enabled": true,
"enabled_at": 1659457620000,
"description": "",
"source_index": "heal_collated_txn_heal_health_2022.05_reindex",
"target_index": "transform_heal_collated_txn_heal_health_2022.05",
"page_size": 1000,
"groups": [
{
"date_histogram": {
"fixed_interval": "1m",
"source_field": "#timestamp",
"target_field": "#timestamp",
"timezone": "Asia/Calcutta"
}
},
{
"terms": {
"source_field": "kpiIdentifier",
"target_field": "kpiIdentifier"
}
},
{
"terms": {
"source_field": "clusterIdentifiers",
"target_field": "clusterIdentifiers"
}
}
],
"aggregations": {
"count_#timestamp": {
"value_count": {
"field": "#timestamp"
}
},
"count_agentIdentifier": {
"value_count": {
"field": "agentIdentifier"
}
},
"sum_value": {
"sum": {
"field": "value"
}
},
"max_value": {
"max": {
"field": "value"
}
},
"avg_value": {
"avg": {
"field": "value"
}
},
"count_value": {
"value_count": {
"field": "value"
}
},
"percentiles_value": {
"percentiles": {
"field": "value",
"percents": [
95
],
"keyed": true,
"tdigest": {
"compression": 100
}
}
},
"serviceIdentifiers": {
"scripted_metric": {
"init_script": "state.docs = []",
"map_script": """
Map span = [
'url':doc['serviceIdentifiers']
];
state.docs.add(span)
""",
"combine_script": "return state.docs;",
"reduce_script": """
def all_docs = [];
for (s in states) {
for (span in s) {
all_docs.add(span);
}
}
def size = all_docs.size();
def serviceIdentifiers_1 = all_docs[0]['url'];
def ret = new HashMap();
ret['serviceIdentifiers'] = serviceIdentifiers_1;
return ret;
"""
}
}
}
}
}

How to do elasticsearch aggregation together with sort and find duplicate values

I want to find duplicate values and if there are duplicate values then I sort based on the last update, so what I take is the newest one, how do I do aggregations? I've tried this aggregation.
I've tried adding sort to sources but it still doesn't work, I've tried several ways but it still fails sometimes it comes out 1 but only old data, sometimes the order is correct from the newest but appears 2 data
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"BILLING_TYPE_CD": "Service Bundle"
}
},
{
"match": {
"ID": "xxxx"
}
},
{
"exists": {
"field": "LI_MILESTONE"
}
},
{
"exists": {
"field": "LI_SID"
}
},
{
"query_string": {
"default_field": "LI_SID",
"query": "*xxxx*"
}
}
],
"must_not": {
"bool": {
"must": [
{
"query_string": {
"default_field": "LI_PRODUCT_NAME",
"query": "*Network*"
}
},
{
"terms": {
"LI_MILESTONE.keyword": [
"Abandoned",
"Cancelled"
]
}
},
{
"terms": {
"ORDER_STATUS.keyword": [
"Abandoned",
"Cancelled",
"Drop In Progress"
]
}
},
{
"term": {
"STATUS.keyword": ""
}
}
]
}
}
}
},
"sort": [
{
"TGL_CREATED": {
"order": "desc"
}
}
],
"aggs": {
"list_products": {
"composite": {
"size": 50000,
"sources": [
{
"LI_SID": {
"terms": {
"field": "LI_SID.keyword",
"order": "desc"
}
}
}
]
},
"aggs": {
"totalService": {
"terms": {
"field": "LI_SID.keyword",
"size": 50000,
"order": {
"_term": "asc"
}
}
},
"bucket_sort": {
"bucket_sort": {
"from": 0,
"size": 10
}
},
"includes_source": {
"top_hits": {
"size": 1,
"_source": {
"includes": [
"LAST_UPDATE",
"xxxxx",
"xxxxx",
"xxxxx",
"xxx"
]
}
}
}
}
},
"term_product": {
"terms": {
"field": "LI_SID.keyword",
"size": 50000
}
}
}
}
Like this ?
{
"aggs": {
"LI_SID": {
"terms": {
"field": "LI_SID.keyword",
"size": 10
},
"aggs": {
"hit": {
"top_hits": {
"size": 1,
"sort": [
{
"LAST_UPDATE": "desc"
}
]
}
}
}
}
},
"size": 0
}
You need to use aggregations response not hits

How to calculate total for each token in Elasticsearch

I have a request into Elastic
{
"query":{
"bool":{
"must":[
{
"query_string":{
"query":"something1 OR something2 OR something3",
"default_operator":"OR"
}
}
],
"filter":{
"range":{
"time":{
"gte":date
}
}
}
}
}
}
I wanna calculate count for each token in all documents using elastic search in one request, for example:
something1: 26 documents
something2: 12 documents
something3: 1 documents
Assuming that the tokens are not akin to enumerations (i.e. constrained set of specific values, like state names, which would make a terms aggregation your best bet with the right mapping), I think the closest thing to what you want would be to use filters aggregation:
POST your-index/_search
{
"query":{
"bool":{
"must":[
{
"query_string":{
"query":"something1 OR something2 OR something3",
"default_operator":"OR"
}
}
],
"filter":{
"range":{
"time":{
"gte":date
}
}
}
}
},
"aggs": {
"token_doc_counts": {
"filters" : {
"filters" : {
"something1" : {
"bool": {
"must": { "query_string" : { "query" : "something1" } },
"filter": { "range": { "time": { "gte": date } } }
}
},
"something2" : {
"bool": {
"must": { "query_string" : { "query" : "something2" } },
"filter": { "range": { "time": { "gte": date } } }
}
},
"something3" : {
"bool": {
"must": { "query_string" : { "query" : "something3" } },
"filter": { "range": { "time": { "gte": date } } }
}
}
}
}
}
}
}
The response would look something like:
{
"took": 9,
"timed_out": false,
"_shards": ...,
"hits": ...,
"aggregations": {
"token_doc_counts": {
"buckets": {
"something1": {
"doc_count": 1
},
"something2": {
"doc_count": 2
},
"something3": {
"doc_count": 3
}
}
}
}
}
You can split your query into filters aggregation of three filters. For reference look here: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-filters-aggregation.html
What you would need to do, is to create a Copy_To field and have the mapping as shown below.
Depending on the fields that your query_string queries, you need to include some or all of the fields with copy_to field.
By default query_string searches all the fields, so you may need to specify copy_to for all the fields as shown in below mapping, where for sake of simplicity, I've created only three fields, title, field_2 and a third field content which would act as copied to field.
Mapping
PUT <your_index_name>
{
"mappings": {
"mydocs": {
"properties": {
"title": {
"type": "text",
"copy_to": "content"
},
"field_2": {
"type": "text",
"copy_to": "content"
},
"content": {
"type": "text",
"fielddata": true
}
}
}
}
}
Sample Documents
POST <your_index_name>/mydocs/1
{
"title": "something1",
"field_2": "something2"
}
POST <your_index_name>/mydocs/2
{
"title": "something2",
"field_2": "something3"
}
Query:
You'd get the required document counts for the each and every token using the below aggregation query and I've made use of Terms Aggregation:
POST <your_index_name>/_search
{
"size": 0,
"query": {
"query_string": {
"query": "something1 OR something2 OR something3"
}
},
"aggs": {
"myaggs": {
"terms": {
"field": "content",
"include" : ["something1","something2","something3"]
}
}
}
}
Query Response:
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"myaggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "something2",
"doc_count": 2
},
{
"key": "something1",
"doc_count": 1
},
{
"key": "something3",
"doc_count": 1
}
]
}
}
}
Let me know if it helps!

Elasticsearch sorting not working properly based on time

I have 20 documents and i'm performing aggregation based on reportid. I need top 10 aggregation based on time in descending. But the response is very random. What am i missing? I'm using elasticsearch 6.2.2 and node.js 4.5. Below here is the body search query for elasticsearch request.
{
"size": 0,
"sort": [
{
"triggerDate":
{
"order": "desc"
}
}],
"query":
{
"bool":
{
"must": [
{
"query_string":
{
"query": "*",
"analyze_wildcard": true
}
},
{
"range":
{
"triggerDate":
{
"gte": fromTime,
"lte": toTime
}
}
}
],
"must_not": [
{
"query_string":
{
"query": "reportId.keyword:\"\"",
"analyze_wildcard": true
}
}]
}
},
"_source":
{
"excludes": []
},
"aggs":
{
"reportid":
{
"terms":
{
"field": "reportId.keyword",
"size": 10
}
}
}
I think what you need to do is aggregate on reportId.keyword and sort aggregation by date.
So here is the solution
{
"size": 0,
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "*",
"analyze_wildcard": true
}
},
{
"range": {
"triggerDate": {
"gte": fromTime,
"lte": toTime
}
}
}
],
"must_not": [
{
"query_string": {
"query": "reportId.keyword:\"\"",
"analyze_wildcard": true
}
}
]
}
},
"_source": {
"excludes": []
},
"aggs": {
"reportid": {
"terms": {
"field": "reportId.keyword",
"size": 10,
"order": {
"2-orderAgg": "desc"
}
},
"aggs": {
"2-orderAgg": {
"max": {
"field": "triggerDate"
}
}
}
}
}
}
You need to sort the aggregation results by a custom aggregation and not the query results.

ElasticSearch : GeoLocation distance search across types

As shown below, there are two types in my city index - zoo and hotel. How do I find all zoos having a hotel in 1KM radius? Here is the mapping of my index :
GET /city/_mapping
{
"city": {
"mappings": {
"hotel": {
"properties": {
"location": {
"type": "geo_point"
},
"name": {
"type": "string"
}
}
},
"zoo": {
"properties": {
"location": {
"type": "geo_point"
},
"name": {
"type": "string"
}
}
}
}
}
}
You can do it with a geo-distance filter for the whole index (just don't specify a type).
As I quick test I created an index like this:
PUT /test_index/
{
"mappings": {
"hotel": {
"properties": {
"location": {
"type": "geo_point"
},
"name": {
"type": "string"
}
}
},
"zoo": {
"properties": {
"location": {
"type": "geo_point"
},
"name": {
"type": "string"
}
}
}
}
}
Added a couple of documents
POST /test_index/_bulk
{"index":{"_type":"hotel","_id":1}}
{"name":"hotel1","location":{"lat" : 40.001, "lon" : -70.001}}
{"index":{"_type":"zoo","_id":1}}
{"name":"zoo1","location":{"lat" : 40.002, "lon" : -70.002}}
And then I can search like this. This query returns the one document:
POST /test_index/_search
{
"query": {
"filtered": {
"filter": {
"geo_distance": {
"distance": 200,
"distance_unit": "km",
"location": {
"lat": 40,
"lon": -70
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "hotel",
"_id": "1",
"_score": 1,
"_source": {
"name": "hotel1",
"location": {
"lat": 40.001,
"lon": -70.001
}
}
}
]
}
}
And this query returns both:
POST /test_index/_search
{
"query": {
"filtered": {
"filter": {
"geo_distance": {
"distance": 300,
"distance_unit": "km",
"location": {
"lat": 40,
"lon": -70
}
}
}
}
}
}
...
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "hotel",
"_id": "1",
"_score": 1,
"_source": {
"name": "hotel1",
"location": {
"lat": 40.001,
"lon": -70.001
}
}
},
{
"_index": "test_index",
"_type": "zoo",
"_id": "1",
"_score": 1,
"_source": {
"name": "zoo1",
"location": {
"lat": 40.002,
"lon": -70.002
}
}
}
]
}
}
Here's the code I used to test it:
http://sense.qbox.io/gist/948d23a5327cf5f22dd368146f37d09e30765fee

Resources