Elastic Search multi match query can't ignore special characters - node.js

I have a name field value as "abc_name" so when I search "abc_" I am getting proper results but when I search "abc_##£&-#&" still I am getting same results. I want my query to ignore this special characters that doesn't matches with my query.
My query has:
Multi_match
type as cross_fields
operator AND
I am using search_analyzer standard for my Fields
And I want this structure as it is otherwise it will affect my other Search behaviour
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "autocomplete",
"search_analyzer": "standard"
}

Please see the below sample which would fit your use case where I've created a custom analyzer which would fit your use case:
Sample Mapping:
PUT some_test_index
{
"settings": {
"analysis": {
"analyzer": {
"my_custom_analyzer": {
"type": "custom",
"tokenizer": "custom_tokenizer",
"filter": ["lowercase", "3_5_edge_ngram"]
}
},
"tokenizer": {
"custom_tokenizer": {
"type": "pattern",
"pattern": "\\w+_+[^a-zA-Z\\d\\s_]+|\\s+". <---- Note this pattern
}
},
"filter": {
"3_5_edge_ngram": {
"type": "edge_ngram",
"min_gram": 3,
"max_gram": 5
}
}
}
},
"mappings": {
"properties": {
"my_field":{
"type": "text",
"analyzer": "my_custom_analyzer"
}
}
}
}
The above mentioned pattern would simply ignore the tokens with the format like abc_$%^^##. As a result this token would not be indexed.
Note that the way the analyzer works is:
First executes tokenizer
Then applies the edge_ngram filter on the tokens generated.
You can verify by simply removing the edge_ngram filter in the above mapping to first understand what tokens are getting generated via Analyze API which would be as below:
POST some_test_index/_analyze
{
"analyzer": "my_custom_analyzer",
"text": "abc_name asda efg_!##!## 1213_adav"
}
Tokens generated:
{
"tokens" : [
{
"token" : "abc_name",
"start_offset" : 0,
"end_offset" : 8,
"type" : "word",
"position" : 0
},
{
"token" : "asda",
"start_offset" : 9,
"end_offset" : 13,
"type" : "word",
"position" : 1
},
{
"token" : "1213_adav",
"start_offset" : 25,
"end_offset" : 34,
"type" : "word",
"position" : 2
}
]
}
Note that the token efg_!##!## has been removed.
I've added edge_ngram fitler as you would want the search to be successful if you search with abc_ if your tokens generated via tokenizer is abc_name.
Sample Document:
POST some_test_index/_doc/1
{
"my_field": "abc_name asda efg_!##!## 1213_adav"
}
Query Request:
Use-case 1:
POST some_test_index/_search
{
"query": {
"match": {
"my_field": "abc_"
}
}
}
Use-case-2:
POST some_test_index/_search
{
"query": {
"match": {
"my_field": "efg_!##!##"
}
}
}
Responses:
Response for use-case-1:
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1,
"relation" : "eq"
},
"max_score" : 0.47992462,
"hits" : [
{
"_index" : "some_test_index",
"_type" : "_doc",
"_id" : "1",
"_score" : 0.47992462,
"_source" : {
"my_field" : "abc_name asda efg_!##!## 1213_adav"
}
}
]
}
}
Response for use-case-2:
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
}
}
Updated Answer:
Create your mapping as follows based on the index I've created and let me know if that works:
PUT some_test_index
{
"settings": {
"analysis": {
"analyzer": {
"my_custom_analyzer": {
"type": "custom",
"tokenizer": "punctuation",
"filter": ["lowercase"]
}
},
"tokenizer": {
"punctuation": {
"type": "pattern",
"pattern": "\\w+_+[^a-zA-Z\\d\\s_]+|\\s+"
}
}
}
},
"mappings": {
"properties": {
"my_field":{
"type": "text",
"analyzer": "autocompete", <----- Assuming you have already this in setting
"search_analyzer": "my_custom_analyzer". <----- Note this
}
}
}
}
Please try and let me know if this works for all your use-cases.

Related

how to match a related data if incorrectly texted a keyword in elastic search

I have a document contain title with "Hard work & Success". I need to do a search for this document. And if I typed "Hardwork" (without spacing) it didn't returning any value. but if I typed "hard work" then it is returning the document.
this is the query I have used :
const search = qObject.search;
const payload = {
from: skip,
size: limit,
_source: [
"id",
"title",
"thumbnailUrl",
"youtubeUrl",
"speaker",
"standards",
"topics",
"schoolDetails",
"uploadTime",
"schoolName",
"description",
"studentDetails",
"studentId"
],
query: {
bool: {
must: {
multi_match: {
fields: [
"title^2",
"standards.standard^2",
"speaker^2",
"schoolDetails.schoolName^2",
"hashtags^2",
"topics.topic^2",
"studentDetails.studentName^2",
],
query: search,
fuzziness: "AUTO",
},
},
},
},
};
if I searched for title "hard work" (included space)
then it returns data like this:
"searchResults": [
{
"_id": "92",
"_score": 19.04531,
"_source": {
"standards": {
"standard": "3",
"categoryType": "STANDARD",
"categoryId": "S3"
},
"schoolDetails": {
"categoryType": "SCHOOL",
"schoolId": "TPS123",
"schoolType": "PUBLIC",
"logo": "91748922mn8bo9krcx71.png",
"schoolName": "Carmel CMI Public School"
},
"studentDetails": {
"studentId": 270,
"studentDp": "164646972124244.jpg",
"studentName": "Nelvin",
"about": "good student"
},
"topics": {
"categoryType": "TOPIC",
"topic": "Motivation",
"categoryId": "MY"
},
"youtubeUrl": "https://www.youtube.com/watch?v=wermQ",
"speaker": "Anna Maria Siby",
"description": "How hardwork leads to success - motivational talk by Anna",
"id": 92,
"uploadTime": "2022-03-17T10:59:59.400Z",
"title": "Hard work & Success",
}
},
]
And if i search for the Keyword "Hardwork" (without spacing) it won't detecting this data. I need to make a space in it or I need to match related datas with the searching keyword. Is there any solution for this can you please help me out of this.
I made an example using a shingle analyzer.
Mapping:
{
"settings": {
"analysis": {
"filter": {
"shingle_filter": {
"type": "shingle",
"max_shingle_size": 4,
"min_shingle_size": 2,
"output_unigrams": "true",
"token_separator": ""
}
},
"analyzer": {
"shingle_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"shingle_filter"
]
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "shingle_analyzer"
}
}
}
}
Now I tested it with your term. Note that the token "hardwork" was generated but the others were also generated which may be a problem for you.
GET idx-separator-words/_analyze
{
"analyzer": "shingle_analyzer",
"text": ["Hard work & Success"]
}
Results:
{
"tokens" : [
{
"token" : "hard",
"start_offset" : 0,
"end_offset" : 4,
"type" : "<ALPHANUM>",
"position" : 0
},
{
"token" : "hardwork",
"start_offset" : 0,
"end_offset" : 9,
"type" : "shingle",
"position" : 0,
"positionLength" : 2
},
{
"token" : "hardworksuccess",
"start_offset" : 0,
"end_offset" : 19,
"type" : "shingle",
"position" : 0,
"positionLength" : 3
},
{
"token" : "work",
"start_offset" : 5,
"end_offset" : 9,
"type" : "<ALPHANUM>",
"position" : 1
},
{
"token" : "worksuccess",
"start_offset" : 5,
"end_offset" : 19,
"type" : "shingle",
"position" : 1,
"positionLength" : 2
},
{
"token" : "success",
"start_offset" : 12,
"end_offset" : 19,
"type" : "<ALPHANUM>",
"position" : 2
}
]
}

Is there any solution for searching exact word and containing word both in elasticsearch

index: process.env.elasticSearchIndexName,
body: {
query: {
bool: {
must: [
{
match_phrase: {
title: `${searchKey}`,
},
},
],
},
},
},
from: (page || constants.pager.page),
size: (limit || constants.pager.limit),
i am using above method but problem in that is it only search exact matched words in whole text.
it can't search containing word.. for example if title = "sweatshirt" than if i type word "shirt" it should come the result but currently not got the result using above method
Standard analyzer(default analyzer if none is specified) breaks texts in tokens.
For sentence "this is a test" tokens generated are [this,is,a,test]
Match_pharse query breaks text in tokens using same analyzer as indexing analyzer and returns documents which 1. contain all the tokens 2. tokens appear in same order.
Since you text is sweatshirt there is single token in inverted index for it "sweatshirt" which will not match with either sweat or shirt
NGram tokenizer
The ngram tokenizer first breaks text down into words whenever it encounters one of a list of specified characters, then it emits N-grams of each word of the specified length
Mapping
PUT my_index
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"tokenizer": "my_tokenizer"
}
},
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 3,
"max_gram": 3,
"token_chars": [
"letter",
"digit"
]
}
}
}
},
"mappings": {
"properties": {
"text":{
"type": "text",
"analyzer": "my_analyzer"
}
}
}
}
Query:
{
"query": {
"match": {
"text": "shirt"
}
}
}
If you will run _analyze query
GET my_index/_analyze
{
"text": ["sweatshirt"],
"analyzer": "my_analyzer"
}
you will see below token are generated for the text sweatshirt. Size of tokens can be adjusted using min_gram and max_gram
{
"tokens" : [
{
"token" : "swe",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 0
},
{
"token" : "wea",
"start_offset" : 1,
"end_offset" : 4,
"type" : "word",
"position" : 1
},
{
"token" : "eat",
"start_offset" : 2,
"end_offset" : 5,
"type" : "word",
"position" : 2
},
{
"token" : "ats",
"start_offset" : 3,
"end_offset" : 6,
"type" : "word",
"position" : 3
},
{
"token" : "tsh",
"start_offset" : 4,
"end_offset" : 7,
"type" : "word",
"position" : 4
},
{
"token" : "shi",
"start_offset" : 5,
"end_offset" : 8,
"type" : "word",
"position" : 5
},
{
"token" : "hir",
"start_offset" : 6,
"end_offset" : 9,
"type" : "word",
"position" : 6
},
{
"token" : "irt",
"start_offset" : 7,
"end_offset" : 10,
"type" : "word",
"position" : 7
}
]
}
Warning:Ngrams increase the size of the inverted index so use with appropriate value of min_gram and max_gram
Another option is to use wildcard query. For wildcard all the documents have to scanned to check if text matches the pattern. They have low performance.
When using wildcard search on not_analyzed fields in case you want to include whitespace ex text.keyword
{
"query": {
"wildcard": {
"text": {
"value": "*shirt*"
}
}
}
}

No results match your search criteria in kibana with timestamp

I created an index in elasticsearch 6.5.1 successfully loaded the data to that index. there is one field "submitted_date" which is the timestamp. below is the mapping like of this field.
"submitted_date": { "type": "date", "format":"yyyy-MM-dd HH:mm:ss.SSS" },
then I created the index pattern. I used the Time Filter field name as "submitted_date". after that, I tried to check the data in Discover tab, but data are not showing. there is a message saying that No results match your search criteria.
NOTE that I have changed the time in time range picker in every possible way which is on top of the right corner in kibana dashboard.
data appear in Dev Tools tab with elastic queries.
ps : I inserted the data using nodejs with elasticsearch official library, did not used logstash.
I followed this article, but it did not help me.
UPDATE : sample document
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 10480,
"max_score" : 1.0,
"hits" : [
{
"_index" : "test",
"_type" : "tests",
"_id" : "1214334",
"_score" : 1.0,
"_source" : {
"priority" : "4",
"submitted_date" : "2018-01-04T18:32:21.000Z",
"submitted_month" : 0,
"submitted_month_name" : "January",
"submitted_day" : 4,
"submitted_weekday" : "Tuesday",
"submitted_hour" : 18,
"submitted_year_month" : "2018-0",
"submitted_year_month_name" : "2018-January",
"date_key" : "20180104",
"year_month_key" : "201801",
"status" : "Closed"
}
}
]
}
}
Inspect request
{
"version": true,
"size": 500,
"sort": [
{
"_score": {
"order": "desc"
}
}
],
"_source": {
"excludes": []
},
"aggs": {
"2": {
"date_histogram": {
"field": "submitted_date",
"interval": "1d",
"time_zone": "Asia/Kolkata",
"min_doc_count": 1
}
}
},
"stored_fields": [
"*"
],
"script_fields": {},
"docvalue_fields": [
{
"field": "close_date",
"format": "date_time"
},
{
"field": "last_modified_date",
"format": "date_time"
},
{
"field": "last_resolved_date",
"format": "date_time"
},
{
"field": "submitted_date",
"format": "date_time"
},
{
"field": "time_to_resolve",
"format": "date_time"
}
],
"query": {
"bool": {
"must": [
{
"match_all": {}
},
{
"range": {
"submitted_date": {
"gte": 1514745000000,
"lte": 1543937620414,
"format": "epoch_millis"
}
}
}
],
"filter": [],
"should": [],
"must_not": []
}
},
"highlight": {
"pre_tags": [
"#kibana-highlighted-field#"
],
"post_tags": [
"#/kibana-highlighted-field#"
],
"fields": {
"*": {}
},
"fragment_size": 2147483647
}
}
Index pattern
function _putMapping() {
return client.indices.create({
index: process.env.ELASTICSEARCH_INDEX,
body: {
settings:{
index:{
"number_of_shards": 1,
"number_of_replicas": 5
},
"index.mapping.ignore_malformed" : true
},
mappings:{
tests:{
properties:{
"last_modified_date": { "type": "date" },
"last_resolved_date": { "type": "date" },
"time_to_resolve": { "type": "date" },
"submitted_date": { "type": "date", "format":"yyyy-MM-dd HH:mm:ss.SSS" },
"date_key": { "type": "integer" },
"priority": { "type": "long" },
"submitted_hour": { "type": "long" },
"submitted_month": { "type": "long" },
"submitted_year": { "type": "long" },
"submitted_year": { "type": "keyword" },
"submitted_year_month": { "type": "keyword" },
"submitted_year_month_name": { "type": "keyword" },
}
}
}
}
});
}
Your mYour submitted_date is coming like 2018-01-04T18:32:21.000Z but your mapping is set as yyyy-MM-dd HH:mm:ss.SSS.
You need to change it to "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'".

Elasticsearch aggrecation give me 2 results insted of one result

I want to aggregate on the brand field and is give me two results instead of one
The brands_aggs give me from this text
{name : "Brand 1"}
2 results
Brand and 1
But Why I need only Brand 1
is separate the word brand and 1 from (Brand 1)
and is give me 2 results in the aggrecation
my mappings where I want to aggregate
mapping = {
"mappings": {
"product": {
"properties": {
"categories": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"fielddata": True
}
"brand": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"fielddata": True
}
}
}
}
}
my post request
{
"query" : {
"bool": {
"must": [
{"match": { "categories": "AV8KW5Wi31qHZdVeXG4G" }}
]
}
},
"size" : 0,
"aggs" : {
"brand_aggs" : {
"terms" : { "field" : "brand" }
},
"categories_aggs" : {
"terms" : { "field" : "categories" }
}
}
}
response from the server
{
"took": 18,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0,
"hits": []
},
"aggregations": {
"categories_aggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "av8kw5wi31qhzdvexg4g",
"doc_count": 1
},
{
"key": "av8kw61c31qhzdvexg4h",
"doc_count": 1
},
{
"key": "av8kxtch31qhzdvexg4a",
"doc_count": 1
}
]
},
"brand_aggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "1", <==== I dont need this , why is give me that ??
"doc_count": 1
},
{
"key": "brand",
"doc_count": 1
}
]
},
}
}
Your mapping has property fields which is used when you want to have multiple analyzers for the same field. In your case valid name of your field is 'brand.keyword'. When you call your aggregate for just 'brand' it use default mapping defined for string.
So your query should be:
{
"query" : {
"bool": {
"must": [
{"match": { "categories": "AV8KW5Wi31qHZdVeXG4G" }}
]
}
},
"size" : 0,
"aggs" : {
"brand_aggs" : {
"terms" : { "field" : "brand.keyword" }
},
"categories_aggs" : {
"terms" : { "field" : "categories.keyword" }
}
}
}
Property field is useful when you want for example search the same property which multiple analyzers, for example:
"full_name": {
"type": "text",
"analyzer": "standard",
"boost": 1,
"fields": {
"autocomplete": {
"type": "text",
"analyzer": "ngram_analyzer"
},
"standard":{
"type": "text",
"analyzer": "standard"
}
}
},
You need to map your string as not_analyzed string, for that run the below query
PUT your_index/_mapping/your_type
{
"your_type": {
"properties": {
"brand": {
"type": "string",
"index": "analyzed",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
Don't forget to replace the your_type and your_index with your type and index values.

Search query to retrieve nested documents in elasticsearch with _source disabled

I have the following mapping
{
"cloth": {
"dynamic" : false,
"_source" : {"enabled" : false },
"properties": {
"name": {
"type": "string",
"index": "analyzed"
},
"variation": {
"type": "nested",
"properties": {
"size": {
"type": "string",
"index": "not_analyzed"
},
"color": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
I am not able to figure out a way to retrieve the nested object fields using the fields query.
{
"fields" : ["name" , "variation.size", "variation.color"],
"query" : {
"nested" : {
"path" : "variation",
"query" : {
"bool" : {
"must" : [
{ "term" : { "variation.size" : "XXL" } },
{ "term" : { "variation.color" : "red" } }
]
}
}
}
}
}
The above query returns
"_id" : "1",
"_score" : 1.987628,
"fields" : {
"variation.size" : [ "XXL", "XL" ],
"variation.color" : [ "red", "black" ],
"name" : [ "Test shirt" ]
}
When I tried
"fields" : ["name" , "variation"]
I got the error
status: 400
reason: "ElasticsearchIllegalArgumentException[field [variation] isn't a leaf field]"
Which is as expected.
How can I get the variation object as it is?
Expected Result. I need to retrieve the variable object as a whole so that I can preserve the association of size and color. Like "red" with "XXL".
"variation" : { "XXL" , "red"}
Update: Source is disabled for this Index Type.
If you use Source Filtering it will return the nested objects as a whole, your query would be:
{
"_source": [
"name",
"variation"
],
"query": {
"nested": {
"path": "variation",
"query": {
"bool": {
"must": [
{
"term": {
"variation.size": "XXL"
}
},
{
"term": {
"variation.color": "red"
}
}
]
}
}
}
}
}
You should use this:
"script_fields": {
"variation": {
"script": {
"inline": "doc['variation.size'].value + ' ' + doc['variation.red'].value"
}
}
}
I use elasticsearch v. 5.1.1

Resources