Elasticsearch correct aggregations for faceted search

Elasticsearch correct aggregations for faceted search - search

I want to offer a faceted search for the clothing products on some platform. As I already have Elasticsearch-based search functionality (simple query, only the name of the product), it would be nice to also implement the faceted search with ES.
This should be done with aggregations as facets are deprecated and one can also have nested aggregations.
However, I cannot wrap my head around the millions of aggregations and which ones are right for me - there are terms, filter, filters, nested, children and so on, and all of them seem suitable.
What I want to achieve may sound pretty basic: I have different facets (brand, condition, color) each having different values. For some facets (brand) the user can select only one value. For others (color) the user is allowed to select up to 3 (as some clothes have more than one color).
I started with multi-field terms facet. Now the next natural step would be to convert this to a terms aggregation (reasons above) but multi-fields are not supported in terms aggregations.
{
"query" : {
"match_all" : { }
},
"facets" : {
"groupByBrandAndCondition" : {
"terms" : {
"fields" : ["brand", "condition"],
"size" : 10
}
}
}
}
I am somehow missing some easy but crucial point here on how to proceed with having parallel multi level bucketing. Speaking in UI terms the user should be able to select something like:
Brands (10)
A (7)
B (3) [X]
Colors (5)
Blue (3) [X]
Red (2) [X]
Read: select A (7), Blue (3) and Red (2)

I created basic mapping like this
POST your_index/your_type/_mapping
{
"your_type": {
"properties": {
"product": {
"type": "string"
},
"brand": {
"type": "string"
},
"color": {
"type": "string"
}
}
}
}
I inserted some documents like this
PUT your_index/your_type/111
{
"product" : "jeans" ,"brand" : "lee", "color" : "blue"
}
PUT your_index/your_type/1111
{
"product" : "shoes" ,"brand" : "levi", "color" : "black"
}
And so on
Simple aggregation query like this
GET your_index/_search
{
"size": 0,
"aggs": {
"prod_agg": {
"terms": {
"field": "product"
},
"aggs": {
"brand_agg": {
"terms": {
"field": "brand"
},
"aggs": {
"color_agg": {
"terms": {
"field": "color"
}
}
}
}
}
}
}
}
will return
"aggregations": {
"prod_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "shoes",
"doc_count": 4,
"brand_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "nike",
"doc_count": 3,
"color_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "blue",
"doc_count": 2
},
{
"key": "black",
"doc_count": 1
}
]
}
},
{
"key": "levi",
"doc_count": 1,
"color_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "black",
"doc_count": 1
}
]
}
}
]
}
},
{
"key": "jeans",
"doc_count": 3,
"brand_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "lee",
"doc_count": 2,
"color_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "black",
"doc_count": 1
},
{
"key": "blue",
"doc_count": 1
}
]
}
},
{
"key": "levi",
"doc_count": 1,
"color_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "black",
"doc_count": 1
}
]
}
}
]
}
}
]
}
}
This could be used to populate UI search criteria.
Then, if the user wants to search for shoes, you could query
GET your_index/_search
{
"size": 0,
"query": {
"match": {
"product": "shoes"
}
},
"aggs": {
"brand_agg": {
"terms": {
"field": "brand"
},
"aggs": {
"color_agg": {
"terms": {
"field": "color"
}
}
}
}
}
}
which will give you
"aggregations": {
"brand_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "nike",
"doc_count": 3,
"color_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "blue",
"doc_count": 2
},
{
"key": "black",
"doc_count": 1
}
]
}
},
{
"key": "levi",
"doc_count": 1,
"color_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "black",
"doc_count": 1
}
]
}
}
]
}
}
or you could have them as separate buckets with query like
GET your_index/_search
{
"size": 0,
"query": {
"match": {
"product": "shoes"
}
},
"aggs": {
"brand_agg": {
"terms": {
"field": "brand"
}
},
"color_agg" : {
"terms": {
"field": "color"
}
}
}
}
which will give you
"aggregations": {
"color_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "black",
"doc_count": 2
},
{
"key": "blue",
"doc_count": 2
}
]
},
"brand_agg": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "nike",
"doc_count": 3
},
{
"key": "levi",
"doc_count": 1
}
]
}
}
Use doc_count value to tell users how many options they have.
Does this satisfy your requirements?

Related

How to calculate total for each token in Elasticsearch

I have a request into Elastic
{
"query":{
"bool":{
"must":[
{
"query_string":{
"query":"something1 OR something2 OR something3",
"default_operator":"OR"
}
}
],
"filter":{
"range":{
"time":{
"gte":date
}
}
}
}
}
}
I want to calculate the document count for each token across all documents using Elasticsearch in one request, for example:
something1: 26 documents
something2: 12 documents
something3: 1 documents

Assuming that the tokens are not akin to enumerations (i.e. constrained set of specific values, like state names, which would make a terms aggregation your best bet with the right mapping), I think the closest thing to what you want would be to use filters aggregation:
POST your-index/_search
{
"query":{
"bool":{
"must":[
{
"query_string":{
"query":"something1 OR something2 OR something3",
"default_operator":"OR"
}
}
],
"filter":{
"range":{
"time":{
"gte":date
}
}
}
}
},
"aggs": {
"token_doc_counts": {
"filters" : {
"filters" : {
"something1" : {
"bool": {
"must": { "query_string" : { "query" : "something1" } },
"filter": { "range": { "time": { "gte": date } } }
}
},
"something2" : {
"bool": {
"must": { "query_string" : { "query" : "something2" } },
"filter": { "range": { "time": { "gte": date } } }
}
},
"something3" : {
"bool": {
"must": { "query_string" : { "query" : "something3" } },
"filter": { "range": { "time": { "gte": date } } }
}
}
}
}
}
}
}
The response would look something like:
{
"took": 9,
"timed_out": false,
"_shards": ...,
"hits": ...,
"aggregations": {
"token_doc_counts": {
"buckets": {
"something1": {
"doc_count": 1
},
"something2": {
"doc_count": 2
},
"something3": {
"doc_count": 3
}
}
}
}
}

You can split your query into filters aggregation of three filters. For reference look here: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-filters-aggregation.html

What you would need to do, is to create a Copy_To field and have the mapping as shown below.
Depending on the fields that your query_string queries, you need to include some or all of the fields with copy_to field.
By default query_string searches all the fields, so you may need to specify copy_to for all the fields as shown in below mapping, where for sake of simplicity, I've created only three fields, title, field_2 and a third field content which would act as copied to field.
Mapping
PUT <your_index_name>
{
"mappings": {
"mydocs": {
"properties": {
"title": {
"type": "text",
"copy_to": "content"
},
"field_2": {
"type": "text",
"copy_to": "content"
},
"content": {
"type": "text",
"fielddata": true
}
}
}
}
}
Sample Documents
POST <your_index_name>/mydocs/1
{
"title": "something1",
"field_2": "something2"
}
POST <your_index_name>/mydocs/2
{
"title": "something2",
"field_2": "something3"
}
Query:
You'd get the required document counts for each and every token using the aggregation query below, in which I've made use of the Terms Aggregation:
POST <your_index_name>/_search
{
"size": 0,
"query": {
"query_string": {
"query": "something1 OR something2 OR something3"
}
},
"aggs": {
"myaggs": {
"terms": {
"field": "content",
"include" : ["something1","something2","something3"]
}
}
}
}
Query Response:
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0,
"hits": []
},
"aggregations": {
"myaggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "something2",
"doc_count": 2
},
{
"key": "something1",
"doc_count": 1
},
{
"key": "something3",
"doc_count": 1
}
]
}
}
}
Let me know if it helps!

Elasticsearch aggregation gives me 2 results instead of one result

I want to aggregate on the brand field, but it gives me two results instead of one.
From this text, brand_aggs gives me
{name : "Brand 1"}
two results:
"brand" and "1".
But why? I need only "Brand 1".
It separates the words "brand" and "1" from "Brand 1"
and gives me two results in the aggregation.
Here are my mappings for the fields I want to aggregate on:
mapping = {
"mappings": {
"product": {
"properties": {
"categories": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"fielddata": True
}
"brand": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"fielddata": True
}
}
}
}
}
my post request
{
"query" : {
"bool": {
"must": [
{"match": { "categories": "AV8KW5Wi31qHZdVeXG4G" }}
]
}
},
"size" : 0,
"aggs" : {
"brand_aggs" : {
"terms" : { "field" : "brand" }
},
"categories_aggs" : {
"terms" : { "field" : "categories" }
}
}
}
response from the server
{
"took": 18,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"categories_aggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "av8kw5wi31qhzdvexg4g",
"doc_count": 1
},
{
"key": "av8kw61c31qhzdvexg4h",
"doc_count": 1
},
{
"key": "av8kxtch31qhzdvexg4a",
"doc_count": 1
}
]
},
"brand_aggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1", <==== I dont need this , why is give me that ??
"doc_count": 1
},
{
"key": "brand",
"doc_count": 1
}
]
},
}
}

Your mapping has the property "fields", which is used when you want to have multiple analyzers for the same field. In your case, the valid name of the field to aggregate on is 'brand.keyword'. When you aggregate on just 'brand', it uses the analyzed text field, whose value "Brand 1" is tokenized into the separate terms "brand" and "1".
So your query should be:
{
"query" : {
"bool": {
"must": [
{"match": { "categories": "AV8KW5Wi31qHZdVeXG4G" }}
]
}
},
"size" : 0,
"aggs" : {
"brand_aggs" : {
"terms" : { "field" : "brand.keyword" }
},
"categories_aggs" : {
"terms" : { "field" : "categories.keyword" }
}
}
}
The "fields" property is useful when you want, for example, to search the same property with multiple analyzers:
"full_name": {
"type": "text",
"analyzer": "standard",
"boost": 1,
"fields": {
"autocomplete": {
"type": "text",
"analyzer": "ngram_analyzer"
},
"standard":{
"type": "text",
"analyzer": "standard"
}
}
},

You need to map your string as a not_analyzed string. To do that, run the query below:
PUT your_index/_mapping/your_type
{
"your_type": {
"properties": {
"brand": {
"type": "string",
"index": "analyzed",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
Don't forget to replace the your_type and your_index with your type and index values.

Elasticsearch aggregate unique attribute per groupId

I have a separate doc in elasticsearch for each product.
Each product has a unique productId and a non unique groupId along with other attributes eg: categories.
I want to be able to aggregate different attributes with their count per unique groupId
example:
doc 1:
{
"productId": 123
"groupId" xyz,
"categories": [{"value": "shoes"}, {"value": "t-shirt"}]
}
doc 2:
{
"productId": 345
"groupId" xyz,
"categories": [{"value": "shoes"}, {"value": "t-shirt"}]
}
doc 3:
{
"productId": 456
"groupId" abc,
"categories": [{"value": "t-shirt"}]
}
doc 4:
{
"productId": 567
"groupId" abc,
"categories": [{"value": "shoes"}, {"value": "makeup"}]
}
expected results, something like:
shoes: 2
t-shirt: 2
makeup: 1
so I want to count each item once if it exists with the same groupId
my query:
{
"from":0,
"size":0,
"query":{
"filtered":{
"filter":{
}
}
},
"aggs": {
"group": {
"terms": {"field": "group"},
"aggs": {
"brand": {
"terms": {"field": "productMeta.brand.value"}
}
}
}
}
}
response:
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 25,
"max_score": 0,
"hits": []
},
"aggregations": {
"group": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 3,
"buckets": [
{
"key": "wlmr34210507",
"doc_count": 8,
"brand": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "generic",
"doc_count": 8
}
]
}
},
{
"key": "wlmr19524441",
"doc_count": 4,
"brand": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "maybelline",
"doc_count": 4
}
]
}
},
{
"key": "wlmr34121549",
"doc_count": 2,
"brand": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "maybelline",
"doc_count": 2
}
]
}
},
{
"key": "wlmr34317301",
"doc_count": 2,
"brand": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "dream on me",
"doc_count": 2
}
]
}
},
{
"key": "bbfs40549552",
"doc_count": 1,
"brand": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "samsung",
"doc_count": 1
}
]
}
},
{
"key": "bobb7937347",
"doc_count": 1,
"brand": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "chicco",
"doc_count": 1
}
]
}
},
{
"key": "wlmr24241413",
"doc_count": 1,
"brand": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "maybelline",
"doc_count": 1
}
]
}
},
{
"key": "wlmr27504560",
"doc_count": 1,
"brand": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "mr. beer",
"doc_count": 1
}
]
}
},
{
"key": "wlmr33986448",
"doc_count": 1,
"brand": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "mr. beer",
"doc_count": 1
}
]
}
},
{
"key": "wlmr40806575",
"doc_count": 1,
"brand": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "healthtex",
"doc_count": 1
}
]
}
}
]
}
}
}

so basically I was able to solve this problem by using cardinality as follows:
{
"from":0,
"size":0,
"query":{
"filtered":{
"filter":{
}
}
},
"sort":{
"ts":{
"order":"desc",
"mode":"max",
"ignore_unmapped":true
}
},
"aggs":{
"categories":{
"terms":{
"field":"productMeta.brand.value",
"size":0
},
"aggs": {
"category" : {
"cardinality" : {
"field" : "group"
}
}
}
}
}
}
The results are the unique group count per category (the cardinality of the "group" field within each bucket):
{
"took": 4,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 71,
"max_score": 0,
"hits": []
},
"aggregations": {
"categories": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "chocolate",
"doc_count": 23,
"category": {
"value": 23
}
},
{
"key": "notebook",
"doc_count": 9,
"category": {
"value": 1
}
},
{
"key": "olive_oil",
"doc_count": 7,
"category": {
"value": 7
}
},
{
"key": "physical_training",
"doc_count": 5,
"category": {
"value": 5
}
},
{
"key": "ski",
"doc_count": 5,
"category": {
"value": 2
}
},
{
"key": "gym_membership",
"doc_count": 4,
"category": {
"value": 4
}
},
{
"key": "ski_boots",
"doc_count": 4,
"category": {
"value": 1
}
},
{
"key": "vinegar",
"doc_count": 4,
"category": {
"value": 4
}
},
{
"key": "bracelet",
"doc_count": 3,
"category": {
"value": 3
}
},
{
"key": "handbags",
"doc_count": 2,
"category": {
"value": 2
}
},
{
"key": "cider",
"doc_count": 1,
"category": {
"value": 1
}
},
{
"key": "ice_cider",
"doc_count": 1,
"category": {
"value": 1
}
},
{
"key": "jewelry_1",
"doc_count": 1,
"category": {
"value": 1
}
},
{
"key": "laces",
"doc_count": 1,
"category": {
"value": 1
}
},
{
"key": "stationery",
"doc_count": 1,
"category": {
"value": 1
}
}
]
}
}
}

Using nested elasticsearch Terms Aggregations the same way you are currently doing with brands, you can produce output structured like this:
"aggregations": {
"groups": {
"buckets": [
{
"key": "xyz",
"categories_per_group": {
"buckets": [
{
"key": "shoes",
"doc_count": 2
},
{
"key": "t-shirt",
"doc_count": 2
}
]
}
},
{
"key": "abc",
"categories_per_group": {
"buckets": [
{
"key": "shoes",
"doc_count": 1
},
{
"key": "t-shirt",
"doc_count": 1
},
{
"key": "makeup",
"doc_count": 1
}
]
}
}
]
}
}
It might be possible to write a pipeline bucket script (ES 2.x) to collect the count of distinct categories across group buckets as you're suggesting.
But it's probably simpler and quicker just to implement this reduce logic yourself using the aggregation buckets output above.

Elastic Search 1.7: Returning Empty Buckets in Nested Terms Aggregation

I have an aggregation query here that is:
Return the number of Records by Type, grouped by Creator in the last 6 months.
The query is as follows:
GET /test/records/_search?search_type=count
{
"aggs": {
"timeRange": {
"filter": {
"range": {
"When": {
"gte": "now-6M",
"lte": "now"
}
}
},
"aggs": {
"groupBy": {
"terms": {
"field": "Creator",
"min_doc_count": 0
},
"aggs": {
"counts": {
"terms": {
"field": "Type",
"min_doc_count": 0
}
}
}
}
}
}
}
}
And the Results appear as:
{
"took": 11,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 261,
"max_score": 0,
"hits": []
},
"aggregations": {
"timeRange": {
"doc_count": 192,
"groupBy": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "ff94d50a-9ced-4877-85cc-a08a00fd49f4",
"doc_count": 175,
"counts": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 3,
"buckets": [
{
"key": "9f937783-dc28-421c-a937-a0c201643aae",
"doc_count": 95
},
{
"key": "36e4b200-e8ca-47f5-b9bb-a09101058595",
"doc_count": 31
},
{
"key": "cf421f05-37b1-470e-9ab9-a0bb0100792d",
"doc_count": 11
}
]
}
},
{
"key": "be8ca900-0011-0002-1976-c737a7e00000",
"doc_count": 0,
"counts": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
},
{
"key": "fae866a8-705e-e111-bd17-d6ec07ced130",
"doc_count": 0,
"counts": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": []
}
}
]
}
}
}
}
From the results listed above, it appears that when there is a 0 count in a terms aggregation, it won't bother listing 0s for the terms in levels below it.
My question is: Is it possible to have it show 0 counts in these scenarios?
Example desired output below:
{
"took": 11,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 261,
"max_score": 0,
"hits": []
},
"aggregations": {
"timeRange": {
"doc_count": 192,
"groupBy": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "ff94d50a-9ced-4877-85cc-a08a00fd49f4",
"doc_count": 175,
"counts": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 3,
"buckets": [
{
"key": "9f937783-dc28-421c-a937-a0c201643aae",
"doc_count": 95
},
{
"key": "36e4b200-e8ca-47f5-b9bb-a09101058595",
"doc_count": 31
},
{
"key": "cf421f05-37b1-470e-9ab9-a0bb0100792d",
"doc_count": 11
}
]
}
},
{
"key": "be8ca900-0011-0002-1976-c737a7e00000",
"doc_count": 0,
"counts": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "9f937783-dc28-421c-a937-a0c201643aae",
"doc_count": 0
},
{
"key": "36e4b200-e8ca-47f5-b9bb-a09101058595",
"doc_count": 0
},
{
"key": "cf421f05-37b1-470e-9ab9-a0bb0100792d",
"doc_count": 0
}
]
}
},
{
"key": "fae866a8-705e-e111-bd17-d6ec07ced130",
"doc_count": 0,
"counts": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "9f937783-dc28-421c-a937-a0c201643aae",
"doc_count": 0
},
{
"key": "36e4b200-e8ca-47f5-b9bb-a09101058595",
"doc_count": 0
},
{
"key": "cf421f05-37b1-470e-9ab9-a0bb0100792d",
"doc_count": 0
}
]
}
}
]
}
}
}
}

This is answered as a comment to the main post by Clemens Klein-Robbenhaar. In short: I know what values to expect, but Elasticsearch does not. Once it counts that the group has an aggregation total of 0, it doesn't know what to search under that, because there is nothing.

Elastic(search): Get docs with max and min timestamp values

I have a problem with a search that I just can't figure out how to do. My docs are of the following form:
{
"timestamp":"2015-03-17T15:05:04.563Z",
"session_id":"1",
"user_id":"jan"
}
Let's say the first timestamp of a session id is the "Login" and the last timestamp is the "Logout". I want to have all "login" and "logout" docs for all sessions (if possible sorted by user_id). I managed to get the right timestamps with aggregations:
{
"aggs" : {
"group_by_uid" : {
"terms" : {
"field" : "user_id"
},
"aggs" : {
"group_by_sid" : {
"terms" : {
"field" : "session_id"
},
"aggs" : {
"max_date" : {
"max": { "field" : "timestamp" }
},
"min_date" : {
"min": { "field" : "timestamp" }
}
}
}
}
}
}
}
But how do I get the corresponding docs? I also don't mind if I have to do 2 searches (one for the logins and one for the logouts). I tried some top hits aggregations and sorting stuff but I always get parse errors :/
I hope someone can give me a hint :)
Best regards,
Jan

Here's a solution in a single search based on the approach proposed by Sloan Ahrens. The advantage is that the start and end session entries are in the same bucket.
{
"aggs": {
"group_by_uid": {
"terms": {
"field": "user_id"
},
"aggs": {
"group_by_sid": {
"terms": {
"field": "session_id"
},
"aggs": {
"session_start": {
"top_hits": {
"size": 1,
"sort": [ { "timestamp": { "order": "asc" } } ]
}
},
"session_end": {
"top_hits": {
"size": 1,
"sort": [ { "timestamp": { "order": "desc" } } ]
}
}
}
}
}
}
}
}
Cheers,
Jan

You're already close. How about this. Use two searches, each aggregating the way you did, but then also get the first top_hit sorting on "timestamp".
I just set up a basic index and added some data that looks like what you posted:
PUT /test_index
{
"settings": {
"number_of_shards": 1
}
}
POST /test_index/_bulk
{"index":{"_index":"test_index","_type":"doc","_id":1}}
{"timestamp":"2015-03-17T15:05:04.563Z","session_id":"1","user_id":"jan"}
{"index":{"_index":"test_index","_type":"doc","_id":2}}
{"timestamp":"2015-03-17T15:10:04.563Z","session_id":"1","user_id":"jan"}
{"index":{"_index":"test_index","_type":"doc","_id":3}}
{"timestamp":"2015-03-17T15:15:04.563Z","session_id":"1","user_id":"jan"}
{"index":{"_index":"test_index","_type":"doc","_id":4}}
{"timestamp":"2015-03-17T18:05:04.563Z","session_id":"1","user_id":"bob"}
{"index":{"_index":"test_index","_type":"doc","_id":5}}
{"timestamp":"2015-03-17T18:10:04.563Z","session_id":"1","user_id":"bob"}
{"index":{"_index":"test_index","_type":"doc","_id":6}}
{"timestamp":"2015-03-17T18:15:04.563Z","session_id":"1","user_id":"bob"}
Then I can get each session's start time with:
POST /test_index/_search?search_type=count
{
"aggs": {
"group_by_uid": {
"terms": {
"field": "user_id"
},
"aggs": {
"group_by_sid": {
"terms": {
"field": "session_id"
},
"aggs": {
"session_start": {
"top_hits": {
"size": 1,
"sort": [ { "timestamp": { "order": "asc" } } ]
}
}
}
}
}
}
}
}
...
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 6,
"max_score": 0,
"hits": []
},
"aggregations": {
"group_by_uid": {
"buckets": [
{
"key": "bob",
"doc_count": 3,
"group_by_sid": {
"buckets": [
{
"key": "1",
"doc_count": 3,
"session_start": {
"hits": {
"total": 3,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "4",
"_score": null,
"_source": {
"timestamp": "2015-03-17T18:05:04.563Z",
"session_id": "1",
"user_id": "bob"
},
"sort": [
1426615504563
]
}
]
}
}
}
]
}
},
{
"key": "jan",
"doc_count": 3,
"group_by_sid": {
"buckets": [
{
"key": "1",
"doc_count": 3,
"session_start": {
"hits": {
"total": 3,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": null,
"_source": {
"timestamp": "2015-03-17T15:05:04.563Z",
"session_id": "1",
"user_id": "jan"
},
"sort": [
1426604704563
]
}
]
}
}
}
]
}
}
]
}
}
}
and end-time with:
POST /test_index/_search?search_type=count
{
"aggs": {
"group_by_uid": {
"terms": {
"field": "user_id"
},
"aggs": {
"group_by_sid": {
"terms": {
"field": "session_id"
},
"aggs": {
"session_end": {
"top_hits": {
"size": 1,
"sort": [ { "timestamp": { "order": "desc" } } ]
}
}
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 6,
"max_score": 0,
"hits": []
},
"aggregations": {
"group_by_uid": {
"buckets": [
{
"key": "bob",
"doc_count": 3,
"group_by_sid": {
"buckets": [
{
"key": "1",
"doc_count": 3,
"session_end": {
"hits": {
"total": 3,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "6",
"_score": null,
"_source": {
"timestamp": "2015-03-17T18:15:04.563Z",
"session_id": "1",
"user_id": "bob"
},
"sort": [
1426616104563
]
}
]
}
}
}
]
}
},
{
"key": "jan",
"doc_count": 3,
"group_by_sid": {
"buckets": [
{
"key": "1",
"doc_count": 3,
"session_end": {
"hits": {
"total": 3,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "3",
"_score": null,
"_source": {
"timestamp": "2015-03-17T15:15:04.563Z",
"session_id": "1",
"user_id": "jan"
},
"sort": [
1426605304563
]
}
]
}
}
}
]
}
}
]
}
}
}
Here's the code I used:
http://sense.qbox.io/gist/05edb48b840e6a992646643913db8ef0a3ccccb3

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Elasticsearch correct aggregations for faceted search - search

Related

How to calculate total for each token in Elasticsearch

Elasticsearch aggregation gives me 2 results instead of one result

Elasticsearch aggregate unique attribute per groupId

Elastic Search 1.7: Returning Empty Buckets in Nested Terms Aggregation

Elastic(search): Get docs with max and min timestamp values

Categories

Resources