OpenSearch transform splitting array values into new events

I'm transforming an index that contains the following event, but the values inside the array are being split into separate new events, e.g.:
"serviceIdentifiers": "Redis"
"serviceIdentifiers": "Event_Detector Service"
etc.
{
"_index": "collated_txn_health_2022.05",
"_type": "_doc",
"_id": "LAUpboIBh6CUatILrsN3",
"_score": 1,
"_source": {
"timeInGMT": 0,
"kpiId": 0,
"compInstanceIdentifier": "d0352b7d-0484-4714-bbc8-eb67cbb7be70",
"agentIdentifier": "ComponentAgent-171",
"kpiIdentifier": "PACKETS_DROPPED",
"categoryIdentifier": "Network Utilization",
"applicationIdentifier": null,
"serviceIdentifiers": [
"Supervisor_Controller Service",
"Event_Detector Service",
"UI_Service",
"Redis",
"CC_Service"
],
"clusterIdentifiers": [
"a5c57ef5-4018-41b8-b727-27c8f8376c0e"
],
"collectionInterval": 60,
"value": "0.0",
"kpiType": "Core",
"groupAttribute": "ALL",
"groupIdentifier": null,
"watcherValue": null,
"errorCode": null,
"clusterOperation": null,
"aggLevelInMins": 1,
"error": false,
"kpiGroup": false,
"discovery": false,
"maintenanceExcluded": false,
"#timestamp": "2022-05-01T01:32:00.000Z"
}
}
Following is the transform job configuration.
curl -u admin:admin -XPUT "http://XXX.XXX.XX.XXX:9201/_plugins/_transform/my-array-job-2" -H 'Content-Type: application/json' -d'
{
"transform": {
"schedule": {
"interval": {
"start_time": 1659705000000,
"period": 1,
"unit": "Minutes"
}
},
"metadata_id": null,
"updated_at": 1659456180000,
"enabled": true,
"enabled_at": 1659457620000,
"description": "",
"source_index": "collated_txn_health_2022.05",
"data_selection_query": {
"match_all": {
"boost": 1
}
},
"target_index": "transform_collated_txn_health_2022.05",
"page_size": 1000,
"groups": [
{
"date_histogram": {
"fixed_interval": "1m",
"source_field": "#timestamp",
"target_field": "#timestamp",
"timezone": "Asia/Calcutta"
}
},
{
"terms": {
"source_field": "clusterIdentifiers",
"target_field": "clusterIdentifiers"
}
},
{
"terms": {
"source_field": "serviceIdentifiers",
"target_field": "serviceIdentifiers"
}
},
{
"terms": {
"source_field": "compInstanceIdentifier",
"target_field": "compInstanceIdentifier"
}
},
{
"terms": {
"source_field": "agentIdentifier",
"target_field": "agentIdentifier"
}
}
],
"aggregations": {
"count_#timestamp": {
"value_count": {
"field": "#timestamp"
}
}
}
}
}'
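Before the job runs, its output can be sanity-checked with the transform preview API (a sketch, assuming the _plugins/_transform/_preview endpoint of recent OpenSearch versions; the body below reuses a trimmed-down version of the job definition above):
POST _plugins/_transform/_preview
{
  "transform": {
    "source_index": "collated_txn_health_2022.05",
    "target_index": "transform_collated_txn_health_2022.05",
    "page_size": 10,
    "groups": [
      {
        "terms": {
          "source_field": "serviceIdentifiers",
          "target_field": "serviceIdentifiers"
        }
      }
    ],
    "aggregations": {
      "count_#timestamp": {
        "value_count": {
          "field": "#timestamp"
        }
      }
    }
  }
}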
Following are the events from the transform index.
{
"_index": "transform_heal_collated_txn_health_2022.05",
"_type": "_doc",
"_id": "ybK0McQ9NZrt9xdo9iWKbA",
"_score": 1,
"_source": {
"transform._id": "my-array-job-2",
"transform._doc_count": 2,
"#timestamp": 1651365120000,
"clusterIdentifiers": "a5c57ef5-4018-41b8-b727-27c8f8376c0e",
"serviceIdentifiers": "Redis",
"compInstanceIdentifier": "a5c57ef5-4018-41b8-b727-27c8f8376c0e",
"agentIdentifier": "ComponentAgent-170",
"count_#timestamp": 2
}
},
{
"_index": "transform_heal_collated_txn_health_2022.05",
"_type": "_doc",
"_id": "Wf-4KwnFaYuw9bL-V-9WEQ",
"_score": 1,
"_source": {
"transform._id": "my-array-job-2",
"transform._doc_count": 2,
"#timestamp": 1651365120000,
"clusterIdentifiers": "a5c57ef5-4018-41b8-b727-27c8f8376c0e",
"serviceIdentifiers": "Redis_Server Service",
"compInstanceIdentifier": "a5c57ef5-4018-41b8-b727-27c8f8376c0e",
"agentIdentifier": "ComponentAgent-170",
"count_#timestamp": 2
}
}
It would be a great help if somebody could suggest a solution for handling array fields.

I have solved the issue with the following Painless script, which helps transform array fields in OpenSearch.
PUT _plugins/_transform/my-array-job-2
{
"transform": {
"schedule": {
"interval": {
"start_time": 1659705000000,
"period": 1,
"unit": "Minutes"
}
},
"metadata_id": null,
"updated_at": 1659456180000,
"enabled": true,
"enabled_at": 1659457620000,
"description": "",
"source_index": "heal_collated_txn_heal_health_2022.05_reindex",
"target_index": "transform_heal_collated_txn_heal_health_2022.05",
"page_size": 1000,
"groups": [
{
"date_histogram": {
"fixed_interval": "1m",
"source_field": "#timestamp",
"target_field": "#timestamp",
"timezone": "Asia/Calcutta"
}
},
{
"terms": {
"source_field": "kpiIdentifier",
"target_field": "kpiIdentifier"
}
},
{
"terms": {
"source_field": "clusterIdentifiers",
"target_field": "clusterIdentifiers"
}
}
],
"aggregations": {
"count_#timestamp": {
"value_count": {
"field": "#timestamp"
}
},
"count_agentIdentifier": {
"value_count": {
"field": "agentIdentifier"
}
},
"sum_value": {
"sum": {
"field": "value"
}
},
"max_value": {
"max": {
"field": "value"
}
},
"avg_value": {
"avg": {
"field": "value"
}
},
"count_value": {
"value_count": {
"field": "value"
}
},
"percentiles_value": {
"percentiles": {
"field": "value",
"percents": [
95
],
"keyed": true,
"tdigest": {
"compression": 100
}
}
},
"serviceIdentifiers": {
"scripted_metric": {
"init_script": "state.docs = []",
"map_script": """
Map span = [
'url':doc['serviceIdentifiers']
];
state.docs.add(span)
""",
"combine_script": "return state.docs;",
"reduce_script": """
def all_docs = [];
for (s in states) {
for (span in s) {
all_docs.add(span);
}
}
def size = all_docs.size();
def serviceIdentifiers_1 = all_docs[0]['url'];
def ret = new HashMap();
ret['serviceIdentifiers'] = serviceIdentifiers_1;
return ret;
"""
}
}
}
}
}
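For reference, here is roughly how the job can then be started and monitored (assuming the standard transform job endpoints; localhost:9200 stands in for your cluster address):
curl -u admin:admin -XPOST "http://localhost:9200/_plugins/_transform/my-array-job-2/_start"
curl -u admin:admin -XGET "http://localhost:9200/_plugins/_transform/my-array-job-2/_explain"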

Related

How to do elasticsearch aggregation together with sort and find duplicate values

I want to find duplicate values, and when there are duplicates, sort by the last update so that I keep only the newest one. How do I write this aggregation? I've tried the aggregation below.
I've also tried adding a sort to the composite sources, but it still doesn't work. I've tried several ways and it still fails: sometimes one result comes back but it's only old data, and sometimes the order is correct (newest first) but two documents appear.
{
"size": 0,
"query": {
"bool": {
"must": [
{
"match": {
"BILLING_TYPE_CD": "Service Bundle"
}
},
{
"match": {
"ID": "xxxx"
}
},
{
"exists": {
"field": "LI_MILESTONE"
}
},
{
"exists": {
"field": "LI_SID"
}
},
{
"query_string": {
"default_field": "LI_SID",
"query": "*xxxx*"
}
}
],
"must_not": {
"bool": {
"must": [
{
"query_string": {
"default_field": "LI_PRODUCT_NAME",
"query": "*Network*"
}
},
{
"terms": {
"LI_MILESTONE.keyword": [
"Abandoned",
"Cancelled"
]
}
},
{
"terms": {
"ORDER_STATUS.keyword": [
"Abandoned",
"Cancelled",
"Drop In Progress"
]
}
},
{
"term": {
"STATUS.keyword": ""
}
}
]
}
}
}
},
"sort": [
{
"TGL_CREATED": {
"order": "desc"
}
}
],
"aggs": {
"list_products": {
"composite": {
"size": 50000,
"sources": [
{
"LI_SID": {
"terms": {
"field": "LI_SID.keyword",
"order": "desc"
}
}
}
]
},
"aggs": {
"totalService": {
"terms": {
"field": "LI_SID.keyword",
"size": 50000,
"order": {
"_term": "asc"
}
}
},
"bucket_sort": {
"bucket_sort": {
"from": 0,
"size": 10
}
},
"includes_source": {
"top_hits": {
"size": 1,
"_source": {
"includes": [
"LAST_UPDATE",
"xxxxx",
"xxxxx",
"xxxxx",
"xxx"
]
}
}
}
}
},
"term_product": {
"terms": {
"field": "LI_SID.keyword",
"size": 50000
}
}
}
}
Like this?
{
"aggs": {
"LI_SID": {
"terms": {
"field": "LI_SID.keyword",
"size": 10
},
"aggs": {
"hit": {
"top_hits": {
"size": 1,
"sort": [
{
"LAST_UPDATE": "desc"
}
]
}
}
}
}
},
"size": 0
}
You need to use the aggregations response, not hits.
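That is, read the newest document from each bucket's top_hits instead of the top-level hits array. With the query above, the relevant part of the response should look roughly like this (values are illustrative):
{
  "aggregations": {
    "LI_SID": {
      "buckets": [
        {
          "key": "xxxx",
          "doc_count": 2,
          "hit": {
            "hits": {
              "hits": [
                {
                  "_source": {
                    "LAST_UPDATE": "2023-01-01T00:00:00Z"
                  }
                }
              ]
            }
          }
        }
      ]
    }
  }
}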

ElasticSearch Highlighting fails on match queries against index with Ngram Analyzer

I have created an index with an ngram analyzer set on all fields and a custom _all field. After indexing a few documents, I am querying against the index to get a suggestion-like feature.
The query does return results, but they are not highlighted.
Analyzer Settings:
"analysis": {
"analyzer": {
"my_edgegram_analyzer": {
"filter": [
"lowercase"
],
"tokenizer": "my_edge_tokenizer"
}
},
"tokenizer": {
"my_edge_tokenizer": {
"token_chars": [
"letter",
"digit",
"punctuation",
"symbol"
],
"min_gram": "3",
"type": "ngram",
"max_gram": "26"
}
}
}
Mapping:
{
"st1": {
"mappings": {
"a": {
"_all": {
"enabled": false
},
"dynamic_templates": [
{
"catch_all": {
"match": "imp*",
"match_mapping_type": "string",
"mapping": {
"analyzer": "my_edgegram_analyzer",
"copy_to": "catch_all",
"norms": false,
"type": "text"
}
}
}
],
"properties": {
"catch_all": {
"type": "text",
"store": true,
"analyzer": "my_edgegram_analyzer"
},
"imp_server_id": {
"type": "text",
"norms": false,
"copy_to": [
"catch_all"
],
"analyzer": "my_edgegram_analyzer"
},
"imp_server_name": {
"type": "text",
"norms": false,
"copy_to": [
"catch_all"
],
"analyzer": "my_edgegram_analyzer"
}
}
},
"b": {
"_all": {
"enabled": false
},
"dynamic_templates": [
{
"catch_all": {
"match": "imp*",
"match_mapping_type": "string",
"mapping": {
"analyzer": "my_edgegram_analyzer",
"copy_to": "catch_all",
"norms": false,
"type": "text"
}
}
}
],
"properties": {
"catch_all": {
"type": "text",
"store": true,
"analyzer": "my_edgegram_analyzer"
},
"imp_server_id": {
"type": "text",
"norms": false,
"copy_to": [
"catch_all"
],
"analyzer": "my_edgegram_analyzer"
},
"imp_server_name": {
"type": "text",
"norms": false,
"copy_to": [
"catch_all"
],
"analyzer": "my_edgegram_analyzer"
}
}
}
}
}
}
Documents:
http://localhost:9200/st1/b/1
{"imp_server_name":"abc1-4-jam9.my.test.com","imp_server_id":"vrock2-us"}
http://localhost:9200/st1/a/1
{"imp_server_name":"abc2-5-ajm9.my.test.com","imp_server_id":"vrock2-us"}
Query:
{
"query": {
"match": {
"catch_all": {
"query":"test",
"analyzer": "keyword"
}
}
},
"highlight": {
"pre_tags": ["<b>"],
"post_tags": ["</b>"],
"fields": {
"*": {}
},
"require_field_match": false
}
}
Response:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 0.16292635,
"hits": [
{
"_index": "st1",
"_type": "a",
"_id": "1",
"_score": 0.16292635,
"_source": {
"imp_server_name": "abc2-5-ajm9.my.test.com",
"imp_server_id": "vrock2-us"
},
"highlight": {
"imp_server_name": [
"abc2-5-ajm9.my.test.com"
],
"catch_all": [
"abc2-5-ajm9.my.test.com"
]
}
},
{
"_index": "st1",
"_type": "b",
"_id": "1",
"_score": 0.16292635,
"_source": {
"imp_server_name": "abc1-4-jam9.my.test.com",
"imp_server_id": "vrock2-us"
},
"highlight": {
"imp_server_name": [
"abc1-4-jam9.my.test.com"
],
"catch_all": [
"abc1-4-jam9.my.test.com"
]
}
}
]
}
}
How can I make highlighting work in the above scenario? Expected output:
"highlight": {
"imp_server_name": [
"abc2-5-ajm9.my.<b>test</b>.com"
],
"catch_all": [
"abc2-5-ajm9.my.<b>test</b>.com"
]
}
I was able to get the results by setting term_vector in the mapping:
"term_vector": "with_positions_offsets"

ElasticSearch : GeoLocation distance search across types

As shown below, there are two types in my city index: zoo and hotel. How do I find all zoos that have a hotel within a 1 km radius? Here is the mapping of my index:
GET /city/_mapping
{
"city": {
"mappings": {
"hotel": {
"properties": {
"location": {
"type": "geo_point"
},
"name": {
"type": "string"
}
}
},
"zoo": {
"properties": {
"location": {
"type": "geo_point"
},
"name": {
"type": "string"
}
}
}
}
}
}
You can do it with a geo-distance filter across the whole index (just don't specify a type).
As a quick test, I created an index like this:
PUT /test_index/
{
"mappings": {
"hotel": {
"properties": {
"location": {
"type": "geo_point"
},
"name": {
"type": "string"
}
}
},
"zoo": {
"properties": {
"location": {
"type": "geo_point"
},
"name": {
"type": "string"
}
}
}
}
}
Then I added a couple of documents:
POST /test_index/_bulk
{"index":{"_type":"hotel","_id":1}}
{"name":"hotel1","location":{"lat" : 40.001, "lon" : -70.001}}
{"index":{"_type":"zoo","_id":1}}
{"name":"zoo1","location":{"lat" : 40.002, "lon" : -70.002}}
And then I can search like this. This query returns the one document:
POST /test_index/_search
{
"query": {
"filtered": {
"filter": {
"geo_distance": {
"distance": 200,
"distance_unit": "km",
"location": {
"lat": 40,
"lon": -70
}
}
}
}
}
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "hotel",
"_id": "1",
"_score": 1,
"_source": {
"name": "hotel1",
"location": {
"lat": 40.001,
"lon": -70.001
}
}
}
]
}
}
And this query returns both:
POST /test_index/_search
{
"query": {
"filtered": {
"filter": {
"geo_distance": {
"distance": 300,
"distance_unit": "km",
"location": {
"lat": 40,
"lon": -70
}
}
}
}
}
}
...
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "hotel",
"_id": "1",
"_score": 1,
"_source": {
"name": "hotel1",
"location": {
"lat": 40.001,
"lon": -70.001
}
}
},
{
"_index": "test_index",
"_type": "zoo",
"_id": "1",
"_score": 1,
"_source": {
"name": "zoo1",
"location": {
"lat": 40.002,
"lon": -70.002
}
}
}
]
}
}
Here's the code I used to test it:
http://sense.qbox.io/gist/948d23a5327cf5f22dd368146f37d09e30765fee

ElasticSearch query stops working with big amount of data

The problem: I have two indexes that are identical in terms of settings and mappings.
The first index contains only one document.
The second index contains the same document plus 16M others.
When I run the query on the first index it returns the document, but when I run the same query on the second, I receive nothing.
Indexes settings:
{
"tasks_test": {
"settings": {
"index": {
"analysis": {
"analyzer": {
"tag_analyzer": {
"filter": [
"lowercase",
"tag_filter"
],
"tokenizer": "whitespace",
"type": "custom"
}
},
"filter": {
"tag_filter": {
"type": "word_delimiter",
"type_table": "# => ALPHA"
}
}
},
"creation_date": "1444127141035",
"number_of_replicas": "2",
"number_of_shards": "5",
"uuid": "wTe6WVtLRTq0XwmaLb7BLg",
"version": {
"created": "1050199"
}
}
}
}
}
Mappings:
{
"tasks_test": {
"mappings": {
"Task": {
"dynamic": "false",
"properties": {
"format": "dateOptionalTime",
"include_in_all": false,
"type": "date"
},
"is_private": {
"type": "boolean"
},
"last_timestamp": {
"type": "integer"
},
"name": {
"analyzer": "tag_analyzer",
"type": "string"
},
"project_id": {
"include_in_all": false,
"type": "integer"
},
"user_id": {
"include_in_all": false,
"type": "integer"
}
}
}
}
}
The document:
{
"_index": "tasks_test",
"_type": "Task",
"_id": "1",
"_source": {
"is_private": false,
"name": "135548- test with number",
"project_id": 2,
"user_id": 1
}
}
The query:
{
"query": {
"filtered": {
"query": {
"bool": {
"must": [
[
{
"match": {
"_all": {
"query": "135548",
"type": "phrase_prefix"
}
}
}
]
]
}
},
"filter": {
"bool": {
"must": [
{
"term": {
"is_private": false
}
},
{
"terms": {
"project_id": [
2
]
}
},
{
"terms": {
"user_id": [
1
]
}
}
]
}
}
}
}
}
Also, some findings:
if I replace _all with name, everything works
if I replace match_phrase_prefix with match_phrase, it works too
ES version: 1.5.1
So, the question is: how can I make the query work on the second index without the workarounds mentioned above?

Elasticsearch use custom analyzer on filter

In an earlier question, elasticsearch nested filter return empty result, I asked about an error in this query where I wasn't getting any results, and the answer pointed out that the expression I use in the filter isn't analyzed the way I expect.
I have a custom analyzer that does the work. How can I tell the filter in the following query to use this custom analyzer?
GET /develop/_search?search_type=dfs_query_then_fetch
{
"query": {
"filtered" : {
"query": {
"bool": {
"must": [
{ "match": { "title": "post" }}
]
}
},
"filter": {
"bool": {
"must": [
{"term": {
"featured": 0
}},
{
"nested": {
"path": "seller",
"filter": {
"bool": {
"must": [
{ "term": { "seller.firstName": "Test 3" } }
]
}
},
"_cache" : true
}}
]
}
}
}
},
"sort": [
{
"_score":{
"order": "desc"
}
},{
"created": {
"order": "desc"
}
}
],
"track_scores": true
}
Here is a setup that seems to do what you want. I used the same basic code as the last answer, but used index_analyzer and search_analyzer in the index definition as follows:
curl -XDELETE "http://localhost:9200/my_index"
curl -XPUT "http://localhost:9200/my_index" -d'
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"filter": {
"snowball": { "type": "snowball", "language": "English" },
"english_stemmer": { "type": "stemmer", "language": "english" },
"english_possessive_stemmer": { "type": "stemmer", "language": "possessive_english" },
"stopwords": { "type": "stop", "stopwords": [ "_english_" ] },
"worddelimiter": { "type": "word_delimiter" }
},
"tokenizer": {
"nGram": { "type": "nGram", "min_gram": 3, "max_gram": 20 }
},
"analyzer": {
"custom_analyzer": {
"type": "custom",
"tokenizer": "nGram",
"filter": [
"stopwords",
"asciifolding",
"lowercase",
"snowball",
"english_stemmer",
"english_possessive_stemmer",
"worddelimiter"
]
},
"custom_search_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"stopwords",
"asciifolding",
"lowercase",
"snowball",
"english_stemmer",
"english_possessive_stemmer",
"worddelimiter"
]
}
}
}
},
"mappings": {
"posts": {
"properties": {
"title": {
"type": "string",
"analyzer": "custom_analyzer",
"boost": 5
},
"seller": {
"type": "nested",
"properties": {
"firstName": {
"type": "string",
"index_analyzer": "custom_analyzer",
"search_analyzer": "custom_search_analyzer",
"boost": 3
}
}
}
}
}
}
}'
Then I added the test docs:
curl -XPUT "http://localhost:9200/my_index/posts/1" -d'
{"title": "post", "seller": {"firstName":"Test 1"}}'
curl -XPUT "http://localhost:9200/my_index/posts/2" -d'
{"title": "post", "seller": {"firstName":"Test 2"}}'
curl -XPUT "http://localhost:9200/my_index/posts/3" -d'
{"title": "post", "seller": {"firstName":"Test 3"}}'
And then a couple of match queries in a bool query, where one is a multi-word query, seem to accomplish what you want:
curl -XPOST "http://localhost:9200/my_index/_search" -d'
{
"query": {
"bool": {
"must": [
{
"match": {
"title": "post"
}
},
{
"nested": {
"path": "seller",
"query": {
"match": {
"seller.firstName": {
"query": "Test 3",
"operator": "and"
}
}
}
}
}
]
}
}
}'
...
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 6.8380365,
"hits": [
{
"_index": "my_index",
"_type": "posts",
"_id": "3",
"_score": 6.8380365,
"_source": {
"title": "post",
"seller": {
"firstName": "Test 3"
}
}
}
]
}
}
Here is the code I used:
http://sense.qbox.io/gist/8cd954aa60be8c44f64e4282e15e6b565c945ecb
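As a side note, the _analyze API is a quick way to check what each analyzer emits for a given input, which helps verify that the index-time ngrams and the search-time tokens actually line up (same host and index as above):
curl -XGET "http://localhost:9200/my_index/_analyze?analyzer=custom_analyzer" -d 'Test 3'
curl -XGET "http://localhost:9200/my_index/_analyze?analyzer=custom_search_analyzer" -d 'Test 3'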
Does that solve your problem?
