Limiting amount of async BigQuery Jobs running on express server - node.js

I have an Express server that pulls data from Google BigQuery. An array of objects is provided. I want to pull sales data for each store in a district, but the table holds sales information only by store and has no district information. I was accomplishing this by sending one query per district, but once the array has more than 50 districts I get errors. The results are stored in individual CSV files by district, so it is convenient to send individual queries and dump each result into its own CSV. BigQuery only allows 50 concurrent jobs at a given time. I am looking for the best way to adapt the code below so that it calls asyncQuery(query) up to 50 times and only makes the next call when a previous call has returned. I have been trying to track the job status using job.getMetadata(), but with no luck yet.
Thanks for any help you can offer
const array = [
  {
    district: "north",
    store: "1001,1002"
  },
  {
    district: "south",
    store: "1003"
  },
  {
    district: "west",
    store: "1004"
  }
]
function apiCall(array) {
  array.forEach(element => {
    let stores = element.store.toString()
    let query = `SELECT store, sku, tot_sales, price
                 FROM big-query-table
                 WHERE store IN (${stores})`
    asyncQuery(query)
      .then(resp => {
        console.log(resp)
      })
      .catch(err => {
        console.error('ERROR:', err);
      })
  })
  return "Running Jobs"
}
function asyncQuery(sqlQuery) {
  const options = {
    query: sqlQuery,
    useLegacySql: false,
  };
  let job;
  return bigquery
    .createQueryJob(options)
    .then(results => {
      job = results[0];
      console.log(`Job ${job.id} started.`);
      return job.promise();
    })
    .then(() => {
      // Get the job's status
      return job.getMetadata();
    })
    .then(metadata => {
      // Check the job's status for errors
      const errors = metadata[0].status.errors;
      if (errors && errors.length > 0) {
        throw errors;
      }
    })
    .then(() => {
      console.log(`Job ${job.id} completed.`);
      return job.getQueryResults();
    })
    .then(results => {
      const rows = results[0];
      return rows;
    })
    .catch(err => {
      console.error('ERROR:', err);
    });
}
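For reference, here is a minimal sketch (not part of the original post) of the batching approach described in the question: process the districts in groups of at most 50 and wait for each group to finish before starting the next. It reuses the asyncQuery function above; the MAX_CONCURRENT constant and apiCallThrottled name are hypothetical, and a library such as p-limit would provide a proper sliding window instead of fixed batches.

// Minimal sketch, assuming the asyncQuery function above; MAX_CONCURRENT and
// apiCallThrottled are hypothetical names, not from the original post.
const MAX_CONCURRENT = 50;

async function apiCallThrottled(array) {
  for (let i = 0; i < array.length; i += MAX_CONCURRENT) {
    const batch = array.slice(i, i + MAX_CONCURRENT);
    // Start up to MAX_CONCURRENT jobs, then wait for all of them to finish
    // before moving on to the next batch of districts.
    await Promise.all(batch.map(element => {
      const query = `SELECT store, sku, tot_sales, price
                     FROM big-query-table
                     WHERE store IN (${element.store})`;
      return asyncQuery(query)
        .then(rows => console.log(`${element.district}: ${rows ? rows.length : 0} rows`));
    }));
  }
}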

With BigQuery - and any other columnar analytical database - you really want to avoid doing 50 queries like:
[*50] SELECT * FROM big-query-table
WHERE storeNumber = ${StoreNumber}
Instead, the best you could do is one query, specifying the columns you are looking for, and all the ids you're looking for:
SELECT col1, col2, col3
FROM big-query-table
WHERE storeNumber IN ('id1', 'id2', ..., 'id50')
Or a straight join:
SELECT col1, col2, col3
FROM big-query-table
WHERE storeNumber IN (SELECT store_id FROM `table`)
Then you won't need to send 50 concurrent queries, and you'll get results in less time and at lower cost.
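To illustrate, here is a sketch (assuming the array and asyncQuery from the question; combinedQuery is a hypothetical helper name) of collapsing the per-district queries into a single job and then splitting the rows back out by district in application code, so each district's CSV can still be written separately:

// Sketch only: `array` and asyncQuery come from the question above.
function combinedQuery(array) {
  const allStores = array.map(d => d.store).join(',');
  const query = `SELECT store, sku, tot_sales, price
                 FROM big-query-table
                 WHERE store IN (${allStores})`;
  return asyncQuery(query).then(rows => {
    // Build a store -> district lookup from the input array.
    const districtOfStore = {};
    array.forEach(d => d.store.split(',').forEach(s => { districtOfStore[s] = d.district; }));
    // Group the result rows by district, ready for the per-district CSV writers.
    const byDistrict = {};
    (rows || []).forEach(row => {
      const district = districtOfStore[String(row.store)];
      (byDistrict[district] = byDistrict[district] || []).push(row);
    });
    return byDistrict;
  });
}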

Related

Two Knex queries (node.js), one of them in a forEach loop

One table is UserChats, with the chat-to-user mappings.
The other table is ChatList, with all chats.
I need to build an array of a specific user's chats in a variable and return it.
_DB("UserChats").select('*').where({
  uID: user_id
}).then(chats => {
  const userChats = []
  chats.forEach(chat => {
    _DB('ChatList').where({
      id: chat.chatID
    }).select('*').then(chat_data => {
      userChats.push(chat_data);
    });
  });
  return userChats; // but this variable is empty
});
How can I elegantly rewrite the code so that all the code is executed?
The join query will be as follows:
_DB('UserChats')
  .select('*')
  .where({ "UserChats.uID": user_id })
  .join('ChatList', 'UserChats.chatID', 'ChatList.id')
  .then(function(rows) {
    return rows;
  });
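If the two-step approach has to stay, here is a sketch (assuming the same _DB Knex instance; getUserChats is a hypothetical wrapper name) of waiting for every per-chat query with Promise.all before returning, instead of returning while those queries are still pending:

// Sketch, assuming the same _DB Knex instance as in the question.
function getUserChats(user_id) {
  return _DB("UserChats")
    .select('*')
    .where({ uID: user_id })
    .then(chats =>
      // Resolve all ChatList lookups before returning; the result is an
      // array of row arrays, matching what the original code pushed.
      Promise.all(
        chats.map(chat =>
          _DB('ChatList').where({ id: chat.chatID }).select('*')
        )
      )
    );
}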

Sequelize - How to Insert data, and then update

I'm using the Sequelize ORM in a Node.js/MariaDB project.
What I'm trying to do is generate a new table of product data using raw queries.
The sequence of my logic is written below.
Step 1. Destroy table to reset data.
Step 2. Insert product data.
Step 3. Update price data in product data.
Step 4. Update stock data in product data.
The problem is steps 3 and 4: they are not working.
What I found is that the INSERT takes some time to finish, so the UPDATEs cannot be fulfilled because there is no product data yet.
Is there any way to invoke steps 3 and 4 as soon as step 2 is finished?
Thanks in advance.
const generateProductList = () => {
  return new Promise(async (resolve, reject) => {
    try {
      await ProductPresentation.destroy({ truncate: true })
      const productSql = `INSERT INTO m_product_presentation (productId, sku, name, status, urlKey, category, shortDescription, imageSmall, imageThumbnail)
        SELECT id, sku, name, status, urlKey, category, shortDescription, imageSmall, imageThumbnail FROM m_product;`
      const priceSql = `UPDATE m_product_presentation INNER JOIN m_price
        ON m_product_presentation.productId = m_price.productId
        SET m_product_presentation.priceRrp = m_price.priceRrp, m_product_presentation.priceRegular = m_price.priceRegular, m_product_presentation.priceSpecial = m_price.priceSpecial;`
      const stockSql = `UPDATE m_product_presentation INNER JOIN m_inventory
        ON m_product_presentation.productId = m_inventory.productId
        SET m_product_presentation.stockAvailability = m_inventory.stockAvailability, m_product_presentation.stockQty = m_inventory.stockQty;`
      // What I want is: create the initial data first, and then update price and stock info. But it fails...
      await ProductPresentation.sequelize.query(productSql, { type: QueryTypes.INSERT })
      await ProductPresentation.sequelize.query(priceSql, { type: QueryTypes.UPDATE })
      await ProductPresentation.sequelize.query(stockSql, { type: QueryTypes.UPDATE })
      resolve()
    } catch (err) {
      reject(err)
      logger.error(err)
    }
  })
}
What you can do is, before running the updates that depend on the same product data, check whether the product has already been entered or not.
const generateProductList = () => {
  return new Promise(async (resolve, reject) => {
    try {
      await ProductPresentation.destroy({ truncate: true })
      const productSql = `INSERT INTO m_product_presentation (productId, sku, name, status, urlKey, category, shortDescription, imageSmall, imageThumbnail)
        SELECT id, sku, name, status, urlKey, category, shortDescription, imageSmall, imageThumbnail FROM m_product;`
      const priceSql = `UPDATE m_product_presentation INNER JOIN m_price
        ON m_product_presentation.productId = m_price.productId
        SET m_product_presentation.priceRrp = m_price.priceRrp, m_product_presentation.priceRegular = m_price.priceRegular, m_product_presentation.priceSpecial = m_price.priceSpecial;`
      const stockSql = `UPDATE m_product_presentation INNER JOIN m_inventory
        ON m_product_presentation.productId = m_inventory.productId
        SET m_product_presentation.stockAvailability = m_inventory.stockAvailability, m_product_presentation.stockQty = m_inventory.stockQty;`
      // Insert the product rows into the table
      const product = await ProductPresentation.sequelize.query(productSql, { type: QueryTypes.INSERT })
      // Check whether the product data now exists in the table
      const isProductEntered = await ProductPresentation.sequelize.query(`
        select exists(select 1 from "m_product_presentation" where "productId"=$1)
      `, {
        bind: [`${product[0][0].productId}`],
        type: QueryTypes.SELECT
      });
      if (isProductEntered[0].exists) {
        await ProductPresentation.sequelize.query(priceSql, { type: QueryTypes.UPDATE })
        await ProductPresentation.sequelize.query(stockSql, { type: QueryTypes.UPDATE })
      }
      resolve()
    } catch (err) {
      reject(err)
      logger.error(err)
    }
  })
}
Only if the product has been entered will the update queries be executed.
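As a side note, here is a sketch (not from the original answer) of running the same three raw queries inside a managed Sequelize transaction, so they execute in order on one connection and commit together; the model and the productSql/priceSql/stockSql strings are taken from the question, and generateProductListTx is a hypothetical name.

// Sketch, assuming the same SQL strings and ProductPresentation model as above.
const { QueryTypes } = require('sequelize')

async function generateProductListTx() {
  const sequelize = ProductPresentation.sequelize
  await ProductPresentation.destroy({ truncate: true })
  // Managed transaction: all three statements run on the same connection,
  // one after another, and are committed (or rolled back) together.
  await sequelize.transaction(async (t) => {
    await sequelize.query(productSql, { type: QueryTypes.INSERT, transaction: t })
    await sequelize.query(priceSql, { type: QueryTypes.UPDATE, transaction: t })
    await sequelize.query(stockSql, { type: QueryTypes.UPDATE, transaction: t })
  })
}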
I found that other source code (some React code in the frontend) was causing this issue. I'll close this question.

Problem when resetting tables and inserting data in BigQuery using the Node API

I get unexpected behaviour when loading data into BigQuery just after creating the schema.
I'm using the Node API to insert data with the BigQuery streaming API.
In order to reset the data, I delete and recreate the tables before loading any data.
My problem: the first time it works fine, but if I execute it again it fails.
The process always deletes and recreates the table schema, but it does not insert the data unless I wait a moment before executing it again.
This is the code which reproduces the case:
async function loadDataIntoBigquery() {
  const {BigQuery} = require('@google-cloud/bigquery')
  const tableName = "users"
  const dataset = "data_analisis"
  const schemaUsers = "name:string,date:string,type:string"
  const userData = [
    {name: "John", date: "20/08/93", type: "reader"},
    {name: "Marie", date: "20/08/90", type: "owner"}
  ]
  try {
    const bigquery = new BigQuery()
    await bigquery.createDataset(dataset)
      .then(() => console.log("dataset created successfully"))
      .catch(err => {
        console.log("warn: maybe the dataset already exists")
      })
    await bigquery.dataset(dataset).table(tableName).delete()
      .then(() => console.log("table deleted successfully"))
      .catch(err => {
        console.log("Error: maybe the table does not exist")
      })
    await bigquery.dataset(dataset).createTable(tableName, {schema: schemaUsers})
      .then(() => console.log("table created successfully"))
      .catch(err => console.log("Error: maybe the table already exists"))
    await bigquery.dataset(dataset).table(tableName).insert(userData)
      .then(data => console.log("Ok inserted ", data))
      .catch(err => console.log("Error: can't insert"))
  } catch (err) {
    console.log("err", err)
  }
}
To verify that the data was inserted, I'm using this query:
select * from `data_analisis.users`
I have the same issue. As a workaround, I insert the data with a query instead:
const query = "INSERT INTO `" + dataset + "." + tableName + "` (name, date, type) " +
  "VALUES ('" + name + "', '" + date + "', '" + type + "')";
await bigQuery.query({
  query: query,
  useLegacySql: false,
  location: 'EU'
}, (err) => {
  console.log("Insertion error: ", err);
})
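A variant of the same workaround (a sketch, not from the original answer) uses named query parameters instead of string concatenation, which avoids quoting and injection issues; it assumes the same bigQuery client and the same dataset, tableName, name, date, and type variables as the snippet above:

// Sketch with named parameters; same variables as the snippet above.
await bigQuery.query({
  query: 'INSERT INTO `' + dataset + '.' + tableName + '` (name, date, type) VALUES (@name, @date, @type)',
  params: { name: name, date: date, type: type },
  location: 'EU'
});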

How to send bulk MongoDB count() queries?

My application has a search field, and to do autocomplete I first fetch the distinct() values and, immediately after, send a count() query for each distinct value. There can be dozens of values to count, which is a lot of queries.
Any idea how I could avoid this large number of queries using MongoDB's NodeJS module?
For now, the query is as such:
const baseQuery = {
  "organization": organization,
  "status": "processed"
}
let domains = []
// A. Query to get the distinct values
MongoDB.getMainCollection().distinct(`location.hostname`, { organization, status: "processed" })
  // B. Got the values, now create a COUNT() query for each
  .then(list => {
    domains = list.map((host, idx) => Object.assign({}, { domain: host, count: 0, idx: idx }))
    const countingPromises = list.map(host => MongoDB.getMainCollection().count(Object.assign({}, baseQuery, { "location.hostname": host })))
    return Promise.all(countingPromises)
  })
  // C. Putting it all together
  .then(values => {
    values.forEach((count, idx) => {
      const domain = domains.find(d => d.idx === idx)
      if (domain) {
        domain.count = count
      }
    })
    domains.sort((a, b) => b.count - a.count)
    resolve(domains)
  })
  .catch(err => reject(new AppError(`Error listing hostnames for #${organization}.`, 500, err, payload)))
P.S. This works as intended and returns what I want; I just want to avoid so many queries and perhaps bundle them if possible.
You can get all the distinct values and their counts in a single aggregate query:
MongoDB.getMainCollection().aggregate([
// Filter for the desired docs
{$match: baseQuery},
// Group the docs by location.hostname and get a count for each
{$group: {_id: '$location.hostname', count: {$sum: 1}}}
])
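For completeness, here is a sketch (assuming the same baseQuery and the surrounding resolve/reject and AppError from the question) of consuming the aggregation result and producing the same sorted { domain, count } list, with the sorting pushed into the pipeline via $sort:

// Sketch: baseQuery, resolve, reject, AppError, organization and payload
// all come from the question's surrounding code.
MongoDB.getMainCollection().aggregate([
  { $match: baseQuery },
  { $group: { _id: '$location.hostname', count: { $sum: 1 } } },
  { $sort: { count: -1 } }
]).toArray()
  .then(results => resolve(results.map(r => ({ domain: r._id, count: r.count }))))
  .catch(err => reject(new AppError(`Error listing hostnames for #${organization}.`, 500, err, payload)));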

Interdependent Transactions with pg-promise

I am trying to build an app that involves posts and tags for posts. For these I have posts, tags, and post_tag tables. tags holds the tags I have defined beforehand, which are suggested to the user somewhere on the front-end. The post_tag table holds the post and tag ids as pairs, one pair per row.
I use Express.js, PostgreSQL, and pg-promise.
As far as I know, I need a transaction for the create-post operation.
I also need a mechanism to detect when a tag was not yet in the tags table at the time the user created the post, so that I can insert it on the fly and have a tag_id for every tag to use when inserting the (post_id, tag_id) pairs into post_tag. Otherwise I get a foreign key error, since post_tag's post_id and tag_id columns reference the id columns of the posts and tags tables, respectively.
Here is the route handler I have used for this so far, without success:
privateAPIRoutes.post('/ask', function (req, res) {
  console.log('/ask req.body: ', req.body);
  // write to posts
  var post_id = ''
  var post_url = ''
  db.query(
    `
    INSERT INTO
      posts (title, text, post_url, author_id, post_type)
    VALUES
      ($(title), $(text), $(post_url), $(author_id), $(post_type))
    RETURNING id
    `,
    {
      title: req.body.title,
      text: req.body.text,
      post_url: slug(req.body.title),
      author_id: req.user.id,
      post_type: 'question'
    } // remember req.user contains decoded jwt saved by mw above.
  )
  .then(post => {
    console.log('/ask post: ', post);
    post_id = post.id
    post_url = post.post_url
    // if tag does not exist create it here
    var tags = req.body.tags;
    console.log('2nd block tags1', tags);
    for (var i = 0; i < tags.length; i++) {
      if (tags[i].id == undefined) {
        console.log('req.body.tags[i].id == undefined', tags[i].id);
        var q1 = db.query("insert into tags (tag) values ($(tag)) returning id", {tag: tags[i].label})
          .then(data => {
            console.log('2nd block tags2', tags);
            tags[i].id = data[0].id
            // write to the post_tag
            db.tx(t => {
              var queries = [];
              for (var j = 0; j < tags.length; j++) {
                var query = t.query(
                  `
                  INSERT INTO
                    post_tag (post_id, tag_id)
                  VALUES
                    ($(post_id), $(tag_id))
                  `,
                  {
                    post_id: post_id,
                    tag_id: tags[j].id
                  }
                )
                queries.push(query);
              }
              return t.batch(queries)
            })
            .then(data => {
              res.json({post_id: post_id, post_url: post_url})
            })
            .catch(error => {
              console.error(error);
            })
          })
          .catch(error => {
            console.error(error);
          });
      }
    }
  })
  .catch(error => {
    console.error(error);
  })
});
The main problem you have: you can't use the root-level db object inside a task or transaction. Trying to create a new connection while inside a transaction breaks the transaction logic; you would need to use t.tx in such cases. However, in your case I don't see that you need it at all.
Corrected code:
privateAPIRoutes.post('/ask', (req, res) => {
  console.log('/ask req.body: ', req.body);
  db.tx(t => {
    return t.one(
      `
      INSERT INTO
        posts (title, text, post_url, author_id, post_type)
      VALUES
        ($(title), $(text), $(post_url), $(author_id), $(post_type))
      RETURNING *
      `,
      {
        title: req.body.title,
        text: req.body.text,
        post_url: slug(req.body.title),
        author_id: req.user.id,
        post_type: 'question'
      } // remember req.user contains decoded jwt saved by mw above.
    )
    .then(post => {
      console.log('/ask second query: post[0]: ', post);
      console.log('/ask second query: tags: ', req.body.tags);
      console.log('/ask second query: tags[0]: ', req.body.tags[0]);
      // the key piece to the answer:
      var tagIds = req.body.tags.map(tag => {
        return tag.id || t.one("insert into tags(tag) values($1) returning id", tag.label, a => a.id);
      });
      return t.batch(tagIds)
        .then(ids => {
          var queries = ids.map(id => {
            return t.one(
              `
              INSERT INTO post_tag (post_id, tag_id)
              VALUES ($(post_id), $(tag_id))
              RETURNING post_id, tag_id
              `,
              {
                post_id: post.id,
                tag_id: id
              }
            )
          });
          return t.batch(queries);
        });
    });
  })
  .then(data => {
    // data = result from the last query;
    console.log('/api/ask', data);
    res.json(data);
  })
  .catch(error => {
    // error
  });
});
The key here is simply to iterate through the tag ids and, for the ones that are not set, use an insert. Then you settle them all by passing the array into t.batch.
Other recommendations:
You should use method one when executing an insert that returns the new record columns.
You should use try/catch only once there, on the transaction; this is relevant to how promises should be used in general, not just to this library.
You can place your queries in external SQL files; see Query Files (a sketch follows this list).
To understand conditional inserts better, see SELECT->INSERT.
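To illustrate the Query Files recommendation, here is a minimal sketch (the file path and contents are an assumption, not part of the original answer) of loading a SQL file with pg-promise's QueryFile and using it in the transaction:

// Sketch: ./sql/insert-post.sql is a hypothetical file containing the
// INSERT INTO posts ... RETURNING * statement shown above.
const path = require('path');
const pgp = require('pg-promise')();

const insertPost = new pgp.QueryFile(path.join(__dirname, 'sql/insert-post.sql'), { minify: true });

// Later, inside the transaction:
// t.one(insertPost, { title, text, post_url, author_id, post_type })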
