I want to aggregate documents in elastic search - node.js

I have a huge collection of documents in Elasticsearch, and I want to group the documents and sum the values for each group.
Sample document:
[
{
"_id": "123",
"meter_id": "1001",
"voltage": "{
"voltage": 50
}",
"date": 2020-05-09T06:03:56Z
}
{
"_id": "1234",
"meter_id": "1002",
"voltage": "{
"voltage": 40
}",
"date": 2020-04-10T06:03:56Z
}
]
Now I want to match this collection against a specific date range — for example, dates between 2020-04-10 and 2020-05-09 — and the documents matching this criteria should be grouped into a single document per meter_id (e.g. 1001) with the average voltage of all matching documents.

POST _bulk
{"index":{"_index":"voltage","_type":"_doc"}}
{"meter_id":"1001","voltage":{"voltage":50},"date":"2020-04-09T06:03:56Z"}
{"index":{"_index":"voltage","_type":"_doc"}}
{"meter_id":"1001","voltage":{"voltage":60},"date":"2020-05-08T08:03:56Z"}
{"index":{"_index":"voltage","_type":"_doc"}}
{"meter_id":"1001","voltage":{"voltage":60},"date":"2020-05-01T08:03:56Z"}
GET voltage/_search
{
"size": 0,
"query": {
"range": {
"date": {
"gte": "2020-04-10",
"lte": "2020-05-09",
"format": "yyyy-MM-dd"
}
}
},
"aggs": {
"by_meter_id": {
"terms": {
"field": "meter_id.keyword"
},
"aggs": {
"avg_voltage": {
"avg": {
"field": "voltage.voltage"
}
}
}
}
}
}

Related

python elasticsearch-dsl return all unique values for specific key

I have a field called account_number . It contains random 6 character string.
I can't seem to get python elasticsearch dsl to return just those unique values.
search = Search(using=client, index=index_name).query(
{
"range": {
"date": {
"gte": "2021-08-01T08:00:00.000Z",
"lte": "2021-08-31T23:59:59.599Z"
#"format": "strict_date_optional_time"
}
}
})
search.aggs.bucket("account_number","terms",field="account_number",size="1000")
es_data = search.execute()
Not sure if I need to define the account_number in the query or if it belongs in the agg bucket. Right now I just get random full rows returned with all columns.
Here is an example of a working query in non-DSL form. I didn't think the metric was necessary, but maybe it is.
{
"aggs": {
"3": {
"terms": {
"field": "account_number",
"order": {
"1": "desc"
},
"size": 5
},
"aggs": {
"1": {
"sum": {
"field": "hits"
}
}
}
}
},
"size": 0,
"stored_fields": [
"*"
],
"script_fields": {},
"docvalue_fields": [
{
"field": "#timestamp",
"format": "date_time"
},
{
"field": "date",
"format": "date_time"
}
],
"_source": {
"excludes": []
},
"query": {
"bool": {
"must": [],
"filter": [
{
"match_all": {}
},
{
"range": {
"date": {
"gte": "2021-04-08T21:00:00.000Z",
"lte": "2021-10-08T21:00:00.000Z",
"format": "strict_date_optional_time"
}
}
}
]
}
}
}
You can add extra(size=0) to your query:
search = Search(using=client, index=index_name).query(
{
"range": {
"date": {
"gte": "2021-08-01T08:00:00.000Z",
"lte": "2021-08-31T23:59:59.599Z"
#"format": "strict_date_optional_time"
}
}
}).extra(size=0)
Then your es_data will be empty and es_data.aggregations.account_number.buckets will contain only unique account numbers.
Hope it helps.

How to perform sub aggregation that will calculate fields with no value per bucket?

Currently building the following Elasticsearch 6.8 query\aggregation:
{
"sort": [
{
"DateCreated": {
"order": "desc"
}
}
],
"query": {
"bool": {
"must": [
{
"match": {
"InternalEntityId": "ExampleValue1111"
}
},
{
"match": {
"Direction": "Inbound"
}
}
]
}
},
"aggs": {
"top_ext": {
"terms": {
"field": "ExternalAddress.keyword"
},
"aggs": {
"top_date": {
"top_hits": {
"sort": [
{
"DateCreated": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
}
How do we perform (in the same search):
Count the sum of (hits per bucket) that have no value (must_not exists style query) PER bucket
Ideally, with the return of the top_ext agg return.. each bucket would have a count of the records that have no value.
Thanks!
Now you can do two things here,
1. Either sort the "top_ext" terms agg bucket by asc order of doc count and you can use the top n zero size buckets here
2. You can apply a bucket selector aggregation in parallel to your inner hits, so that only those inner hits with zero docCounts will appear.
Here is a query dsl that uses both the above approaches.(You can plug in all other required elements of the query, I have focused mainly on the aggregation part here)
GET kibana_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"outer": {
"terms": {
"field": "products.category.keyword",
"size": 10,
"order": {
"_count": "asc"
}
},
"aggs": {
"inner": {
"top_hits": {
"size": 10
}
},
"restrictedBuckets": {
"bucket_selector": {
"buckets_path": {
"docCount": "_count"
},
"script": "params.docCount<1"
}
}
}
}
}
}

Need pagination on aggregation grouping in Elasticsearch

We are applying aggregation and grouping, and need pagination for this.
let body = {
size: item_per_page,
"query": {
"bool": {
"must": [{
"terms": {
"log_action_master_id": action_type
}
}, {
"match": {
[search_by]: searchParams.user_id
}
}, {
"match": {
"unit_id": searchParams.unit_id
}
},
{
"range": {
[search_date]: {
gte: from,
lte: to
}
}
}
]
}
},
"aggs": {
"group": {
"terms": {
"field": "id",
"size": item_per_page,
"order": { "_key": sortdirction }
},
},
"types_count": {
"value_count": {
"field": "id.keyword"
}
},
},
};
You can use below options:-
Composite Aggregation: can combine multiple data sources into a single bucket and allows pagination and sorting on it. It can only paginate linearly using after_key, i.e. you cannot jump from page 1 to page 3. You can fetch "n" records, then pass the returned after key to fetch the next "n" records.
GET index22/_search
{
"size": 0,
"aggs": {
"ValueCount": {
"value_count": {
"field": "id.keyword"
}
},
"pagination": {
"composite": {
"size": 2,
"sources": [
{
"TradeRef": {
"terms": {
"field": "id.keyword"
}
}
}
]
}
}
}
}
Include partition: groups the field's values into a number of partitions at query time and processes only one partition in each request. Term values are evenly distributed across the partitions, so you must know the number of terms beforehand. You can use a cardinality aggregation to get the count.
GET index22/_search
{
"size": 0,
"aggs": {
"TradeRef": {
"terms": {
"field": "id.keyword",
"include": {
"partition": 0,
"num_partitions": 3
}
}
}
}
}
Bucket Sort aggregation: sorts the buckets of its parent multi-bucket aggregation. Each bucket may be sorted based on its _key, _count, or its sub-aggregations. It only applies to buckets returned by the parent aggregation. You will need to set the term size to 10,000 (the max value) and truncate the buckets in bucket_sort. You can paginate using from and size just like in a query. If you have more than 10,000 terms you won't be able to use this approach, since it only selects from the buckets returned by the terms aggregation.
GET index22/_search
{
"size": 0,
"aggs": {
"valueCount":{
"value_count": {
"field": "TradeRef.keyword"
}
},
"TradeRef": {
"terms": {
"field": "TradeRef.keyword",
"size": 10000
},
"aggs": {
"my_bucket": {
"bucket_sort": {
"sort": [
{
"_key": {
"order": "asc"
}
}
],
"from": 2,
"size": 1
}
}
}
}
}
}
In terms of performance, composite aggregation is the better choice.

Is it possible to filter on a nested aggregation result?

Imagine I have a movie document, and its ratings is modelled as nested fields:
"mappings": {
"movie": {
"properties": {
"name": {"type": "text"}
"ratings": {
"type": "nested"
"properties": {
"userId": {"type": "keyword"},
"rating": {"type": "integer"}
}
}
}
}
}
What I want to do is: for a given movie name, and a list of users' ids. I want to find the movie and lowest rating among these users. I managed to construct a query to do the job
{
"query": {
"bool": {
"must": [{
"match": {
"name": "fake movie name"
}
}],
"filter": {
"nested": {
"path": "ratings",
"query": {
"bool": {
"must": {
"match": {
"ratings.userId": ["user1", "user2"]
}
}
}
}
}
}
},
"aggs": {
"userIdFilter": {
"filter": {
"terms": {
"ratings.userId": ["user1", "user2"]
}
},
"aggs": {
"lowestRating": {
"min": {
"field": "ratings.rating"
}
}
}
}
}
}
}
Is it possible to add a filter on the lowest rating, so that only documents whose lowest rating is below a certain value are returned?
I hope there is a way to approach this without using script, I tried bucket-selector-aggregation, but cannot get a working version. Any ideas?
Thank you

elastic search date range aggregation not giving complete data

I am Querying for getting aggregate data based on date_range, like below
"aggs": {
"range": {
"date_range": {
"field": "sold",
"ranges": [
{ "from": "2014-11-01", "to": "2014-11-30" },
{ "from": "2014-08-01", "to": "2014-08-31" }
]
}
}
}
using this I am getting this response
"aggregations": {
"range": {
"buckets": [
{
"key": "2014-08-01T00:00:00.000Z-2014-08-31T00:00:00.000Z",
"from": 1406851200000,
"from_as_string": "2014-08-01T00:00:00.000Z",
"to": 1409443200000,
"to_as_string": "2014-08-31T00:00:00.000Z",
"doc_count": 1
},
{
"key": "2014-11-01T00:00:00.000Z-2014-11-30T00:00:00.000Z",
"from": 1414800000000,
"from_as_string": "2014-11-01T00:00:00.000Z",
"to": 1417305600000,
"to_as_string": "2014-11-30T00:00:00.000Z",
"doc_count": 2
}
]
}
}
but instead of only doc_count, I also need the complete aggregated data that satisfies each range.
Is there any way to get this? Please help.
It's not clear what other fields you're looking for so I've included a couple of examples.
By nesting another aggs inside your first one, you can ask Elasticsearch to pull back additional values e.g. averages, sums, counts, min, max, stats, etc.
this example query will bring back field_count - a count of instances of myfield
and also return order_count - a sum based on a script.
"aggs": {
"range": {
"date_range": {
"field": "sold",
"ranges": [
{ "from": "2014-11-01", "to": "2014-11-30" },
{ "from": "2014-08-01", "to": "2014-08-31" }
]
}
}
},
"aggs" : {
"field_count": {"value_count" : { "field" : "myfield" } },
"order_count": {"sum" : {"script" : " doc[\"output_msgtype\"].value == \"order\" ? 1 : 0"} } }}
}
If you aren't looking for any sums, counts, averages on your data - then an aggregation isn't going to help.
I would instead run a standard query once per range. e.g.:
curl -XGET 'http://localhost:9200/test/cars/_search?pretty' -d '{
"fields" : ["price", "color", "make", "sold" ],
"query":{
"filtered": {
"query": {
"match_all" : { }
},
"filter" : {
"range": {"sold": {"gte": "2014-09-21T20:03:12.963","lte": "2014-09-24T20:03:12.963"}}}
}
}
}'
repeat this query as needed but modifying the range each time.

Resources