ElasticSearch CouchDB river - explicitly specify field type - couchdb

I am using ElasticSearch river to index a CouchDB database of tweets.
The "created_at" field doesn't conform to the "date" type and gets indexed as a String.
How would I start a river with explicitly specifying that "created_at" is a Date, so that I could do range queries on it?
I tried the following river request, but it didn't work and the field was still indexed as a String:
curl -XPUT 'localhost:9200/_river/my_db/_meta' -d '{
"type" : "couchdb",
"couchdb" : {
"host" : "localhost",
"port" : 5984,
"db" : "testtweets",
"filter" : null
},
"index" : {
"index" : "my_testing",
"type" : "my_datetesting",
"properties" : {"created_at": {
"type" : "date",
"format" : "yyyy-MM-dd HH:mm:ss"
}
},
"bulk_size" : "100",
"bulk_timeout" : "10ms"
}
}'
My data looks like this:
{
"_id": "262856000481136640",
"_rev": "1-0ed7c0fe655974e236814184bef5ff16",
"contributors": null,
"truncated": false,
"text": "RT #edoswald: Ocean City MD first to show that #Sandy is no joke. Pier badly damaged, sea nearly topping the seawall http://t.co/D0Wwok4 ...",
"author_name": "Casey Strader",
"author_created_at": "2011-04-21 20:00:32",
"author_description": "",
"author_location": "",
"author_geo_enabled": false,
"source": "Twitter for iPhone",
"retweeted": false,
"coordinates": null,
"author_verified": false,
"entities": {
"user_mentions": [
{
"indices": [
3,
12
],
"id_str": "10433822",
"id": 10433822,
"name": "Ed Oswald",
"screen_name": "edoswald"
}
],
"hashtags": [
{
"indices": [
47,
53
],
"text": "Sandy"
}
],
"urls": [
{
"indices": [
117,
136
],
"url": "http://t.co/D0Wwok4",
"expanded_url": "http://t.co/D0Wwok4",
"display_url": "t.co/D0Wwok4"
}
]
},
"in_reply_to_screen_name": null,
"author_id_str": "285792303",
"retweet_count": 98,
"id_str": "262856000481136640",
"favorited": false,
"source_url": "http://twitter.com/download/iphone",
"author_screen_name": "Casey_Rae22",
"geo": null,
"in_reply_to_user_id_str": null,
"author_time_zone": "Eastern Time (US & Canada)",
"created_at": "2012-10-29 09:58:48",
"in_reply_to_status_id_str": null,
"place": null
}
Thanks!

Related

how to match a related data if incorrectly texted a keyword in elastic search

I have a document containing the title "Hard work & Success". I need to be able to search for this document. If I type "Hardwork" (without a space) it doesn't return any value, but if I type "hard work" then it returns the document.
this is the query I have used :
const search = qObject.search;
const payload = {
from: skip,
size: limit,
_source: [
"id",
"title",
"thumbnailUrl",
"youtubeUrl",
"speaker",
"standards",
"topics",
"schoolDetails",
"uploadTime",
"schoolName",
"description",
"studentDetails",
"studentId"
],
query: {
bool: {
must: {
multi_match: {
fields: [
"title^2",
"standards.standard^2",
"speaker^2",
"schoolDetails.schoolName^2",
"hashtags^2",
"topics.topic^2",
"studentDetails.studentName^2",
],
query: search,
fuzziness: "AUTO",
},
},
},
},
};
if I searched for title "hard work" (included space)
then it returns data like this:
"searchResults": [
{
"_id": "92",
"_score": 19.04531,
"_source": {
"standards": {
"standard": "3",
"categoryType": "STANDARD",
"categoryId": "S3"
},
"schoolDetails": {
"categoryType": "SCHOOL",
"schoolId": "TPS123",
"schoolType": "PUBLIC",
"logo": "91748922mn8bo9krcx71.png",
"schoolName": "Carmel CMI Public School"
},
"studentDetails": {
"studentId": 270,
"studentDp": "164646972124244.jpg",
"studentName": "Nelvin",
"about": "good student"
},
"topics": {
"categoryType": "TOPIC",
"topic": "Motivation",
"categoryId": "MY"
},
"youtubeUrl": "https://www.youtube.com/watch?v=wermQ",
"speaker": "Anna Maria Siby",
"description": "How hardwork leads to success - motivational talk by Anna",
"id": 92,
"uploadTime": "2022-03-17T10:59:59.400Z",
"title": "Hard work & Success",
}
},
]
And if I search for the keyword "Hardwork" (without a space) it won't find this document. I either need to insert a space into the query or find some way to match related data with the search keyword. Is there any solution for this? Can you please help me out?
I made an example using a shingle analyzer.
Mapping:
{
"settings": {
"analysis": {
"filter": {
"shingle_filter": {
"type": "shingle",
"max_shingle_size": 4,
"min_shingle_size": 2,
"output_unigrams": "true",
"token_separator": ""
}
},
"analyzer": {
"shingle_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"shingle_filter"
]
}
}
}
},
"mappings": {
"properties": {
"title": {
"type": "text",
"analyzer": "shingle_analyzer"
}
}
}
}
Now I tested it with your term. Note that the token "hardwork" was generated, but the other shingles were generated as well, which may be a problem for you.
GET idx-separator-words/_analyze
{
"analyzer": "shingle_analyzer",
"text": ["Hard work & Success"]
}
Results:
{
"tokens" : [
{
"token" : "hard",
"start_offset" : 0,
"end_offset" : 4,
"type" : "<ALPHANUM>",
"position" : 0
},
{
"token" : "hardwork",
"start_offset" : 0,
"end_offset" : 9,
"type" : "shingle",
"position" : 0,
"positionLength" : 2
},
{
"token" : "hardworksuccess",
"start_offset" : 0,
"end_offset" : 19,
"type" : "shingle",
"position" : 0,
"positionLength" : 3
},
{
"token" : "work",
"start_offset" : 5,
"end_offset" : 9,
"type" : "<ALPHANUM>",
"position" : 1
},
{
"token" : "worksuccess",
"start_offset" : 5,
"end_offset" : 19,
"type" : "shingle",
"position" : 1,
"positionLength" : 2
},
{
"token" : "success",
"start_offset" : 12,
"end_offset" : 19,
"type" : "<ALPHANUM>",
"position" : 2
}
]
}

API Request within another API request (Same API) in Python

I have currently made a Python program that requests JSON data from an API. Now here is the thing: this JSON actually contains other request URLs for getting extra data about each object.
import requests
import json
import sys
import os
import geojson
response = requests.get("http://api.gipod.vlaanderen.be/ws/v1/workassignment", params = {"CRS": "Lambert72"})
print(response.status_code)
text = json.dumps(response.json(),sort_keys=True, indent=4)
print(text)
f = open("text.json", "wt")
f.write(text)
print(os.getcwd())
JSON from the request; the other request URLs, including parameters, are in the "detail" field.
[
{
"gipodId": 103246,
"owner": "Eandis Leuven",
"description": ", , ZAVELSTRAAT: E Nieuw distributienet (1214m)",
"startDateTime": "2007-12-03T06:00:00",
"endDateTime": "2014-01-06T19:00:00",
"importantHindrance": false,
"coordinate": {
"coordinates": [
4.697028256276443,
50.896894135898485
],
"type": "Point",
"crs": {
"type": "name",
"properties": {
"name": "urn:ogc:def:crs:OGC:1.3:CRS84"
}
}
},
"detail": "http://api.gipod.vlaanderen.be/ws/v1/workassignment/103246?crs=4326",
"cities": ["Leuven"]
}
],
"latestUpdate": "2016-11-16T11:32:39.253"
}
The first request just gets the points (each unique with a certain id), while the second request gets the "details data" which also has polygon data and multiline.
Get Url:
http://api.gipod.vlaanderen.be/ws/v1/workassignment/[id]
{ "comment" : null,
"contactDetails" : { "city" : "Leuven",
"country" : "België",
"email" : null,
"extraAddressInfo" : null,
"firstName" : null,
"lastName" : null,
"number" : "58",
"organisation" : "Eandis Leuven",
"phoneNumber1" : "078/35.35.34",
"phoneNumber2" : null,
"postalCode" : "3012",
"street" : "Aarschotsesteenweg"
},
"contractor" : null,
"mainContractor" : null,
"description" : ", , ZAVELSTRAAT: E Nieuw distributienet (1214m)",
"diversions" : [
{
"gipodId": 1348152,
"reference": "IOW-TERRAS-2013-01-Z",
"description": "Horecaterras op parkeerstrook (Lierbaan 12)",
"comment": null,
"geometry": {
"geometries": [
{
"coordinates": [[[3.212947654779088, 51.175784679668915],
[3.2151308569159482, 51.17366647833133],
[3.216112818368467, 51.17328051591839],
[3.2186926906668876, 51.173044950954456],
[3.2204789191276944, 51.173098278776514],
[3.221602856602255, 51.173333934695286]]],
"type": "MultiLineString",
"crs": null
}
],
"type": "GeometryCollection",
"crs": {
"type": "name",
"properties": {
"name": "urn:ogc:def:crs:OGC:1.3:CRS84"
}
}
},
"periods": [{"startDateTime": "2013-04-09T00:00:00","endDateTime": "2013-10-31T00:00:00"}],
"recurrencePattern": null,
"latestUpdate": "2014-01-24T10:23:08.917",
"streets": null,
"diversionTypes": null,
"diversionDirection":
{
"type": 0,
"description": "Beide"
},
"status": "Vergund",
"contactDetails": {
"organisation": "Café Real",
"lastName": "Vets",
"firstName": "Peggy",
"phoneNumber1": null,
"phoneNumber2": null,
"email": "peggy.vets#skynet.be",
"street": "Lierbaan",
"number": "12",
"postalCode": "2580",
"city": "Putte",
"country": "België",
"extraAddressInfo": null
}
"url": null,
}
],
"endDateTime" : "2014-01-06T19:00:00",
"gipodId" : 103246,
"hindrance" : { "description" : null,
"direction" : null,
"effects" : [ "Fietsers hebben doorgang",
"Handelaars bereikbaar",
"Verminderde doorstroming in 1 richting",
"Voetgangers op de rijweg",
"Voetgangers hebben doorgang"
],
"important" : false,
"locations" : [ "Voetpad" ]
},
"latestUpdate" : "2013-06-18T03:43:28.17",
"location" : { "cities" : [ "Leuven" ],
"coordinate" : { "coordinates" : [ 4.697028256276443,
50.896894135898485
],
"crs" : { "properties" : { "name" : "urn:ogc:def:crs:OGC:1.3:CRS84" },
"type" : "name"
},
"type" : "Point"
},
"geometry" : { "coordinates" : [ [ [ [ 4.699934331336474,
50.90431808607037
],
[ 4.699948535632464,
50.90431829749237
],
[ 4.699938837004092,
50.90458139231922
],
[ 4.6999246328435396,
50.90458118062111
],
[ 4.699934331336474,
50.90431808607037
]
] ]
],
"crs" : { "properties" : { "name" : "urn:ogc:def:crs:OGC:1.3:CRS84" },
"type" : "name"
},
"type" : "MultiPolygon"
}
},
"owner" : "Eandis Leuven",
"reference" : "171577",
"startDateTime" : "2007-12-03T06:00:00",
"state" : "In uitvoering",
"type" : "Werken aan nutsleiding",
"url" : "http://www.eandis.be"
}
Now here is the deal: this request has to be repeated for each object I get from the first API request, and that can be over one hundred objects. So logic dictates this has to happen in a loop, though how to start is a bit troublesome.
You can make use of functions in this case.
Your first function can simply fetch the list of points. Your second function can then fetch the detail data for each point.
def fetch_details(url: str):
""" Makes request call to get the data of detail """
response = requests.get(url)
# any other processe
def fetch_points(url: str):
response = requests.get(url)
for obj in response.json():
fetch_details(obj.get("detail"))
api_url = "api.gipod.vlaanderen.be/ws/v1/workassignment"
fetch_points(api_url)

Converting a MongoDB aggregate into an ArangoDB COLLECT

I'm migrating data from Mongo to Arango and I need to reproduce a $group aggregation. I have successfully reproduced the results but I'm concerned that my approach maybe sub-optimal. Can the AQL be improved?
I have a collection of data that looks like this:
{
"_id" : ObjectId("5b17f9d85b2c1998598f054e"),
"department" : [
"Sales",
"Marketing"
],
"region" : [
"US",
"UK"
]
}
{
"_id" : ObjectId("5b1808145b2c1998598f054f"),
"department" : [
"Sales",
"Marketing"
],
"region" : [
"US",
"UK"
]
}
{
"_id" : ObjectId("5b18083c5b2c1998598f0550"),
"department" : "Development",
"region" : "Europe"
}
{
"_id" : ObjectId("5b1809a75b2c1998598f0551"),
"department" : "Sales"
}
Note the value can be a string, Array or not present
In Mongo I'm using the following code to aggregate the data:
db.test.aggregate([
{
$unwind:{
path:"$department",
preserveNullAndEmptyArrays: true
}
},
{
$unwind:{
path:"$region",
preserveNullAndEmptyArrays: true
}
},
{
$group:{
_id:{
department:{ $ifNull: [ "$department", "null" ] },
region:{ $ifNull: [ "$region", "null" ] },
},
count:{$sum:1}
}
}
])
In Arango I'm using the following AQL:
FOR i IN test
LET FIELD1=(FOR a IN APPEND([],NOT_NULL(i.department,"null")) RETURN a)
LET FIELD2=(FOR a IN APPEND([],NOT_NULL(i.region,"null")) RETURN a)
FOR f1 IN FIELD1
FOR f2 IN FIELD2
COLLECT id={department:f1,region:f2} WITH COUNT INTO counter
RETURN {_id:id,count:counter}
Edit:
The APPEND is used to convert string values into an Array
Both produce results that look like this;
{
"_id" : {
"department" : "Marketing",
"region" : "US"
},
"count" : 2.0
}
{
"_id" : {
"department" : "Development",
"region" : "Europe"
},
"count" : 1.0
}
{
"_id" : {
"department" : "Sales",
"region" : "null"
},
"count" : 1.0
}
{
"_id" : {
"department" : "Marketing",
"region" : "UK"
},
"count" : 2.0
}
{
"_id" : {
"department" : "Sales",
"region" : "UK"
},
"count" : 2.0
}
{
"_id" : {
"department" : "Sales",
"region" : "US"
},
"count" : 2.0
}
Your approach seems alright. I would suggest using TO_ARRAY() instead of APPEND(), though, to make it easier to understand.
Both functions skip null values, thus it is unavoidable to provide some placeholder, or test for null explicitly and return an array with a null value (or whatever works best for you):
FOR doc IN test
FOR field1 IN doc.department == null ? [ null ] : TO_ARRAY(doc.department)
FOR field2 IN doc.region == null ? [ null ] : TO_ARRAY(doc.region)
COLLECT department = field1, region = field2
WITH COUNT INTO count
RETURN { _id: { department, region }, count }
Collection test:
[
{
"_key": "5b17f9d85b2c1998598f054e",
"department": [
"Sales",
"Marketing"
],
"region": [
"US",
"UK"
]
},
{
"_key": "5b18083c5b2c1998598f0550",
"department": "Development",
"region": "Europe"
},
{
"_key": "5b1808145b2c1998598f054f",
"department": [
"Sales",
"Marketing"
],
"region": [
"US",
"UK"
]
},
{
"_key": "5b1809a75b2c1998598f0551",
"department": "Sales"
}
]
Result:
[
{
"_id": {
"department": "Development",
"region": "Europe"
},
"count": 1
},
{
"_id": {
"department": "Marketing",
"region": "UK"
},
"count": 2
},
{
"_id": {
"department": "Marketing",
"region": "US"
},
"count": 2
},
{
"_id": {
"department": "Sales",
"region": null
},
"count": 1
},
{
"_id": {
"department": "Sales",
"region": "UK"
},
"count": 2
},
{
"_id": {
"department": "Sales",
"region": "US"
},
"count": 2
}
]

how to create a query with geolocation in mongoose (use for search by near places)

I'm quite new to MongoDB and mongoose. I don't know if my query is working, but when I add some GeoJSON to my code it returns null.
My goal is to be able to filter my data by city, state and country, and also to search for nearby places. It would be a really great help if someone could assist me. Thanks
var query = {
$and : [
{city : new RegExp('^'+req.body.city+'$', "i") },
{state : req.body.state},
{country : req.body.country},
{
loc : {
$nearSphere : {
$geometry : {
type : "Point",
coordinates : [-117.16108380000003,32.715738]
},
$maxDistance : 100
}
}
}
Business.find(query).populate('deal_id').sort({business_type : -1,deal_id : -1})
.exec(function(err,businesses){
res.json(businesses)
return
})
I don't know if im doing it right, here's my sample data:
[
{
"_id": "5a0b1f489929442c36fd5c83",
"business_row": 29160,
"created_at": "2017-11-14T16:52:10.130Z",
"owner_name": "David Lui",
"company_website": "",
"phone_number": "604-273-3288",
"contact_name": "David Lui",
"zip_postal": "V6X 3Z9",
"state": "British Columbia",
"country": "Canada",
"city": "Richmond",
"address": "3779 Sexsmith Rd # 2172 Richmond British Columbia",
"company_name": "Aem Seafood",
"__v": 1,
"slug": "Aem-Seafood&Richmond",
"loc": {
"coordinates": [
"-123.129488",
"49.185359"
],
"type": "Point"
},
"deal_id": [],
"is_favorite": false,
"is_draft": false,
"has_featured": false,
"owner_id": [
"5a0adcf9f7205f0004535def"
],
"files": [],
"operations": [],
"sub_category": [],
"category_options": [
{
"value": "5a0b186b9f3a4a2710075654",
"sub_cat": {
"value": "59f6d13d00086a6e645c50a4",
"label": "Meat And Fish Markets"
}
}
],
"category_id": [
"5a0b186b9f3a4a2710075654"
],
"business_type_name": "Free",
"business_type": "0",
"user_id": [
"5a0adcf9f7205f0004535def"
]
}
]
It turns out I don't need to query the city, state and country for it, and can use $geoWithin instead.

Elastic search with CouchDB river plugin - Can't find any documents

I recently started using elasticsearch and couchdb and I have the following problem. I have a couch database with a bunch of documents. I add a couchDb river index on elasticsearch and I expect to have those documents indexed and searchable. But when I search for anything though ES I don't get any results. The command flow is as follows:
The command above verifies that there are 4 documents in the couchDb instance
curl -H "Content-Type: application/json" -X GET http://localhost:5984/my_db
result:
{
"db_name": "my_db",
"doc_count": 4,
"doc_del_count": 0,
"update_seq": 4,
"purge_seq": 0,
"compact_running": false,
"disk_size": 16482,
"data_size": 646,
"instance_start_time": "1370204643908592",
"disk_format_version": 6,
"committed_update_seq": 4
}
The _changes output:
curl -H "Content-Type: application/json" -X GET http://localhost:5984/my_db/_changes
{
"results": [
{
"seq": 1,
"id": "1",
"changes": [
{
"rev": "1-40d928a959dd52d183ab7c413fabca92"
}
]
},
{
"seq": 2,
"id": "2",
"changes": [
{
"rev": "1-42212757a56b240f5205266b1969e890"
}
]
},
{
"seq": 3,
"id": "3",
"changes": [
{
"rev": "1-f59c2ae7acacb68d9414be05d56ed33a"
}
]
},
{
"seq": 4,
"id": "4",
"changes": [
{
"rev": "1-e86cf1c287c16906e81d901365b9bf98"
}
]
}
],
"last_seq": 4
}
Now, below I m creating my index in ES.
curl -XPUT 'http://localhost:9200/_river/my_db/_meta' -d '{
"type": "couchdb",
"couchdb": {
"host": "localhost",
"port": 5984,
"db": "my_db",
"filter": null
}
}'
{
"ok": true,
"_index": "_river",
"_type": "my_db",
"_id": "_meta",
"_version": 1
}
But I don't get anything back.
curl -XGET "http://localhost:9200/my_db/my_db/_search?pretty=true"
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 0,
"max_score" : null,
"hits" : []
}
}
Is there anything I'm missing?
You're missing the ElasticSearch index settings from your river metadata. From here:
{
"type" : "couchdb",
"couchdb" : {
"host" : "localhost",
"port" : 5984,
"db" : "my_db",
"filter" : null
},
"index" : {
"index" : "my_db",
"type" : "my_db",
"bulk_size" : "100",
"bulk_timeout" : "10ms"
}
}
I haven't seen any documentation that suggests the "index" member can be inferred.

Resources