Problem when recreating tables and inserting data in BigQuery using the Node API - node.js

I have unexpected behaviour when loading data into BigQuery just after creating the schema.
I'm using Node API to insert data with BigQuery streaming API.
In order to reset the data, I delete and recreate the tables before loading anything.
My problem: the first time it works fine, but if I execute it again it fails.
The process always deletes and recreates the table schema, but it does not insert the data unless I wait a while before executing it again.
This is the code that reproduces the case:
async function loadDataIntoBigquery() {
  const {BigQuery} = require('@google-cloud/bigquery')
  const tableName = "users"
  const dataset = "data_analisis"
  const schemaUsers = "name:string,date:string,type:string"
  const userData = [
    {name: "John", date: "20/08/93", type: "reader"},
    {name: "Marie", date: "20/08/90", type: "owner"}
  ]
  try {
    const bigquery = new BigQuery()
    await bigquery.createDataset(dataset)
      .then(() => console.log("dataset created successfully"))
      .catch(() => console.log("warn: maybe the dataset already exists"))
    await bigquery.dataset(dataset).table(tableName).delete()
      .then(() => console.log("table deleted successfully"))
      .catch(() => console.log("Error: maybe the table does not exist"))
    await bigquery.dataset(dataset).createTable(tableName, {schema: schemaUsers})
      .then(() => console.log("table created successfully"))
      .catch(() => console.log("Error: maybe the table already exists"))
    await bigquery.dataset(dataset).table(tableName).insert(userData)
      .then((data) => console.log("Ok inserted ", data))
      .catch((err) => console.log("Error: can't insert ", err))
  } catch (err) {
    console.log("err", err)
  }
}
To verify that the data was inserted, I'm using this query:
select * from `data_analisis.users`

I have the same issue. As a workaround, I insert the data with a query instead:
const query = "INSERT INTO `" + dataset + "." + tableName + "` (name, date, type) VALUES ('" + name + "','" + date + "','" + type + "')";
await bigQuery.query({
  query: query,
  useLegacySql: false,
  location: 'EU'
}).catch(err => {
  console.log("Insertion error: ", err);
});
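Concatenating user values into the SQL text is fragile and open to injection. A minimal sketch of the same workaround using named query parameters (reusing the bigQuery, dataset, tableName and value variables from the snippet above; identifiers still have to be concatenated because table names cannot be parameterized):

// Named query parameters (@name, @date, @type) keep the values out of the SQL text.
const query =
  "INSERT INTO `" + dataset + "." + tableName + "` (name, date, type) " +
  "VALUES (@name, @date, @type)";
await bigQuery.query({
  query: query,
  params: {name: name, date: date, type: type},
  useLegacySql: false,
  location: 'EU'
});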

Related

Sequelize - How to Insert data, and then update

I'm using the Sequelize ORM for a Node.js / MariaDB project.
What I'm trying to do is generate a new table for product data using raw queries.
The sequence of my logic is written below.
Step 1. Destroy table to reset data.
Step 2. Insert product data.
Step 3. Update price data in product data.
Step 4. Update stock data in product data.
The problem is steps 3 and 4: they are not working!
What I found is that the 'Insert' takes some time to finish, so the 'Update' cannot be fulfilled because there's no product data yet.
Is there any way to invoke steps 3-4 as soon as step 2 is finished?
Thanks in advance.
const generateProductList = () => {
return new Promise( async (resolve, reject) => {
try {
await ProductPresentation.destroy({ truncate: true })
const productSql = `INSERT INTO m_product_presentation (productId, sku, name, status, urlKey, category, shortDescription, imageSmall, imageThumbnail)
SELECT id, sku, name, status, urlKey, category, shortDescription, imageSmall, imageThumbnail FROM m_product;`
const priceSql = `UPDATE m_product_presentation INNER JOIN m_price
ON m_product_presentation.productId = m_price.productId
SET m_product_presentation.priceRrp = m_price.priceRrp, m_product_presentation.priceRegular = m_price.priceRegular, m_product_presentation.priceSpecial = m_price.priceSpecial;`
const stockSql = `UPDATE m_product_presentation INNER JOIN m_inventory
ON m_product_presentation.productId = m_inventory.productId
SET m_product_presentation.stockAvailability = m_inventory.stockAvailability, m_product_presentation.stockQty = m_inventory.stockQty;`
// What I want is, Create initial data first. And then, update price and stock info. But, It fail...
await ProductPresentation.sequelize.query(productSql, { type: QueryTypes.INSERT })
await ProductPresentation.sequelize.query(priceSql, { type: QueryTypes.UPDATE })
await ProductPresentation.sequelize.query(stockSql, { type: QueryTypes.UPDATE })
resolve()
} catch(err) {
reject(err)
logger.error(err)
}
})
}
What you can do is check whether the product has already been inserted before running the updates that depend on it.
const generateProductList = () => {
return new Promise(async (resolve, reject) => {
try {
await ProductPresentation.destroy({ truncate: true })
const productSql = `INSERT INTO m_product_presentation (productId, sku, name, status, urlKey, category, shortDescription, imageSmall, imageThumbnail)
SELECT id, sku, name, status, urlKey, category, shortDescription, imageSmall, imageThumbnail FROM m_product;`
const priceSql = `UPDATE m_product_presentation INNER JOIN m_price
ON m_product_presentation.productId = m_price.productId
SET m_product_presentation.priceRrp = m_price.priceRrp, m_product_presentation.priceRegular = m_price.priceRegular, m_product_presentation.priceSpecial = m_price.priceSpecial;`
const stockSql = `UPDATE m_product_presentation INNER JOIN m_inventory
ON m_product_presentation.productId = m_inventory.productId
SET m_product_presentation.stockAvailability = m_inventory.stockAvailability, m_product_presentation.stockQty = m_inventory.stockQty;`
// Inserting the product in the table
const product = await ProductPresentation.sequelize.query(productSql, { type: QueryTypes.INSERT })
// To find if the product is existing the table or not
const isProductEntered = await sequelize.query(`
select exists(select 1 from "m_product_presentation" where "productId"=$1)
`, {
bind: [`${product[0][0].productId}`],
type: QueryTypes.SELECT
});
if (isProductEntered[0].exists) {
await ProductPresentation.sequelize.query(priceSql, { type: QueryTypes.UPDATE })
await ProductPresentation.sequelize.query(stockSql, { type: QueryTypes.UPDATE })
}
resolve()
} catch (err) {
reject(err)
logger.error(err)
}
})
}
Only if the product has been inserted will the update queries be executed.
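If you also want the insert and both updates to run strictly in order and as a single unit, another option (a minimal sketch, not the approach above; it assumes the same models and the productSql/priceSql/stockSql strings from the earlier snippets) is a managed Sequelize transaction:

const { QueryTypes } = require('sequelize')

const generateProductList = async () => {
  await ProductPresentation.destroy({ truncate: true })
  // Run the insert and both updates inside one managed transaction:
  // they execute in order, and if any statement throws, the batch is rolled back.
  await ProductPresentation.sequelize.transaction(async (t) => {
    await ProductPresentation.sequelize.query(productSql, { type: QueryTypes.INSERT, transaction: t })
    await ProductPresentation.sequelize.query(priceSql, { type: QueryTypes.UPDATE, transaction: t })
    await ProductPresentation.sequelize.query(stockSql, { type: QueryTypes.UPDATE, transaction: t })
  })
}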
I found that other source code (some React code in the frontend) caused this issue. I'll close this issue.

Show data after insert using Sequelize raw queries in Express

I'm trying to return the data inserted with a raw query using Sequelize and then show it. Below is my code:
const c_product_post = async (req, res) => {
try {
const sql = `INSERT INTO products (p_name, p_price, p_stock, p_review, "createdAt", "updatedAt")
VALUES ('${req.body.product_name}', ${req.body.product_price}, ${req.body.product_stock}, ${req.body.product_review}, now(), now());`
const postData = await Product.sequelize.query(sql)
// await postData.save()
res.send({
message: "success add new product",
data: postData
})
}
catch (err) {
res.send({
message: err
})
}
}
What I'm trying to achieve is that after the data is inserted, it is returned in the response so it can be shown (highlighted in red in the original screenshot, not included here):
Add a RETURNING clause to your query. Try this:
INSERT INTO products (p_name, p_price, p_stock, p_review, "createdAt", "updatedAt")
VALUES ('${req.body.product_name}', ${req.body.product_price}, ${req.body.product_stock}, ${req.body.product_review}, now(), now())
RETURNING *;
Please note that your approach is highly prone to SQL injection. Consider using prepared statements (bind parameters) instead of text substitution.
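A minimal sketch of the same insert using Sequelize bind parameters (reusing the Product model and request shape from the question; the exact shape of the returned rows can vary by dialect, so treat the destructuring as an assumption):

const c_product_post = async (req, res) => {
  try {
    // $1..$4 are bind parameters: the driver sends the values separately
    // from the SQL text, so they cannot break out of the statement.
    const sql = `INSERT INTO products (p_name, p_price, p_stock, p_review, "createdAt", "updatedAt")
                 VALUES ($1, $2, $3, $4, now(), now())
                 RETURNING *;`
    const [rows] = await Product.sequelize.query(sql, {
      bind: [
        req.body.product_name,
        req.body.product_price,
        req.body.product_stock,
        req.body.product_review
      ]
    })
    res.send({
      message: "success add new product",
      data: rows
    })
  } catch (err) {
    res.send({ message: err })
  }
}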

Only process 500 lines/rows at a time with createReadStream

I have to read a really large CSV file, so I searched Google and learned about createReadStream. I am using a program that reads the CSV file data and inserts it into MongoDB.
The process I am following:
Process the data using createReadStream (I think it reads the file line by line).
Store the data in an array.
Insert the data into MongoDB using insertMany.
Now the problem is that the whole file is first stored in an array and only then inserted into the database.
I think a better approach would be to store only the first 500 lines/rows in an array, insert them into the DB, and then repeat the same steps for the next 500 records.
Is it possible to achieve this? And is this the right way to do it?
My program:
const test = async () => {
const stream = fs.createReadStream(workerData)
.pipe(parse())
.on('data', async function(csvrow) {
try{
stream.pause()
if(!authorName.includes(csvrow.author)) {
const author = new Author({author: csvrow.author})
authorId = author._id
authorName.push(author.author)
authorData.push(author)
}
if(!companyName.includes(csvrow.company_name)) {
const company = new Company({companyName: csvrow.company_name})
companyID = company._id
companyName.push(company.companyName)
companyData.push(company)
}
users = new User({
name: csvrow.firstname,
dob: csvrow.dob,
address: csvrow.address,
phone: csvrow.phone,
state: csvrow.state,
zip: csvrow.zip,
email: csvrow.email,
gender: csvrow.gender,
userType: csvrow.userType
})
userData.push(users)
book = new Book({
book_number: csvrow.book_number,
book_name: csvrow.book_name,
book_desc: csvrow.book_desc,
user_id: users._id,
author_id: authorId
})
bookData.push(book)
relationalData.push({
username: users.name,
author_id: authorId,
book_id: book._id,
company_id: companyID
})
}finally {
stream.resume()
}
})
.on('end', async function() {
try {
Author.insertMany(authorData)
User.insertMany(userData)
Book.insertMany(bookData)
Company.insertMany(companyData)
await Relational.insertMany(relationalData)
parentPort.postMessage("true")
}catch(e){
console.log(e)
parentPort.postMessage("false")
}
})
}
test()
This program works fine and inserts the data into the DB, but I am looking for something like this:
const stream = fs.createReadStream(workerData)
.pipe(parse())
.on('data', async function(csvrow, maxLineToRead: 500) {
// whole code/logic of insert data into DB
})
maxLineToRead is my imaginary term here.
Basically, I want to process 500 rows at a time, insert them into the DB, and repeat this process until the end of the file.
You can create a higher scoped array variable where you accumulate rows of data as they arrive on the data event. When you get to 500 rows, fire off your database operation to insert them. If not yet at 500 rows, then just add the next one to the array and wait for more data events to come.
Then, in the end event insert any remaining rows still in the higher scoped array.
In this way, you will insert 500 at a time and then however many are left at the end. Compared to inserting everything at the end, this has the advantage of spreading the database load out over the time you are parsing.
Here's an attempt to implement that type of processing. There are some unknowns (documented with comments) based on an incomplete description of exactly what you're trying to accomplish in some circumstances:
const test = () => {
return new Promise((resolve, reject) => {
const accumulatedRows = [];
async function processRows(rows) {
// initialize data arrays that we will insert
const authorData = [],
companyData = [],
userData = [],
bookData = [],
relationalData = [];
// this code still has a problem that I don't have enough context
// to know how to solve
// If authorName contains csvrow.author, then the variable
// authorId is not initialized, but is used later in the code
// This is a problem that needs to be fixed.
// The same issue occurs for companyID
for (let csvrow of rows) {
let authorId, companyID;
if (!authorName.includes(csvrow.author)) {
const author = new Author({ author: csvrow.author })
authorId = author._id
authorName.push(author.author)
authorData.push(author)
}
if (!companyName.includes(csvrow.company_name)) {
const company = new Company({ companyName: csvrow.company_name })
companyID = company._id
companyName.push(company.companyName)
companyData.push(company)
}
let users = new User({
name: csvrow.firstname,
dob: csvrow.dob,
address: csvrow.address,
phone: csvrow.phone,
state: csvrow.state,
zip: csvrow.zip,
email: csvrow.email,
gender: csvrow.gender,
userType: csvrow.userType
});
userData.push(users)
let book = new Book({
book_number: csvrow.book_number,
book_name: csvrow.book_name,
book_desc: csvrow.book_desc,
user_id: users._id,
author_id: authorId
});
bookData.push(book)
relationalData.push({
username: users.name,
author_id: authorId,
book_id: book._id,
company_id: companyID
});
}
// all local arrays of data are populated now for this batch
// so add this data to the database
await Author.insertMany(authorData);
await User.insertMany(userData);
await Book.insertMany(bookData);
await Company.insertMany(companyData);
await Relational.insertMany(relationalData);
}
const batchSize = 500; // insert 500 rows per batch, as described in the question
const stream = fs.createReadStream(workerData)
.pipe(parse())
.on('data', async function(csvrow) {
try {
accumulatedRows.push(csvrow);
if (accumulatedRows.length >= batchSize) {
stream.pause();
await processRows(accumulatedRows);
// clear out the rows we just processed
accumulatedRows.length = 0;
stream.resume();
}
} catch (e) {
// calling destroy(e) will prevent leaking a stream
// and will trigger the error event to be called with that error
stream.destroy(e);
}
}).on('end', async function() {
try {
await processRows(accumulatedRows);
resolve();
} catch (e) {
reject(e);
}
}).on('error', (e) => {
reject(e);
});
});
}
test().then(() => {
parentPort.postMessage("true");
}).catch(err => {
console.log(err);
parentPort.postMessage("false");
});

POST the same UUID into two tables with Node / PostgreSQL

I am trying to make a POST request that will insert the same UUID value into two tables: 'employee' and 'skill'. I have tried this a few different ways, but have not been able to do so. Here is my query for posting the UUID (and a 'summary') into one table:
app.post("/employees/:id/skills", async(req, res) => {
try {
const { summary } = req.body;
const addEmployeeSkill = await pool.query(
"INSERT INTO skill(skill_uuid, summary)VALUES(uuid_generate_v4(), $1) RETURNING *",
[summary],
);
res.json(addEmployeeSkill.rows[0]);
} catch (err) {
console.error(err.message);
}
});
My question is: how do I get the same UUID that is being generated into the 'skill' table to also insert into the skill_uuid column of the 'employee' table?
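One possible approach (a sketch of one way to do it, not necessarily how the original code is meant to work): generate the UUID in Node first, then use the same value in both statements inside a single transaction. The employee_uuid column name below is a hypothetical placeholder for the employee table's key column; pool is the existing pg Pool from the question.

const { randomUUID } = require("crypto"); // built in since Node 14.17+

app.post("/employees/:id/skills", async (req, res) => {
  const client = await pool.connect();
  try {
    const { summary } = req.body;
    const skillUuid = randomUUID(); // generate the UUID in Node so both statements share it
    await client.query("BEGIN");
    const addEmployeeSkill = await client.query(
      "INSERT INTO skill(skill_uuid, summary) VALUES($1, $2) RETURNING *",
      [skillUuid, summary]
    );
    // "employee_uuid" is an assumed column name; adjust to your schema
    await client.query(
      "UPDATE employee SET skill_uuid = $1 WHERE employee_uuid = $2",
      [skillUuid, req.params.id]
    );
    await client.query("COMMIT");
    res.json(addEmployeeSkill.rows[0]);
  } catch (err) {
    await client.query("ROLLBACK");
    console.error(err.message);
    res.status(500).json({ error: err.message });
  } finally {
    client.release();
  }
});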

Limiting amount of async BigQuery Jobs running on express server

I have an Express server that pulls data from Google BigQuery; an array of objects is provided. I want to pull sales data for each store in a district, but the table holds sales information only by store and has no district information. I was accomplishing this by sending one query per district, but once the array has more than 50 districts I get errors. The results are stored in individual CSV files by district, so it is convenient to send individual queries and dump each result into its own CSV. BigQuery only allows 50 jobs at a given time. I am looking for the best way to adapt the code below so it calls asyncQuery(query) at most 50 times at once and only makes the next call when a previous call has returned. I have been trying to track the job status using job.getMetadata(), but no luck yet.
Thanks for any help you can offer
const array = [
  {
    district: "north",
    store: "1001,1002"
  },
  {
    district: "south",
    store: "1003"
  },
  {
    district: "west",
    store: "1004"
  }
]
function apiCall(array) {
array.forEach(element => {
let stores = element.store.toString()
let query = `SELECT store, sku, tot_sales, price
FROM big-query-table
WHERE
store IN (${stores})`
asyncQuery(query)
.then(resp => {
console.log(resp)
}).catch(err => {
console.error('ERROR:', err);
})
})
return "Running Jobs"
}
function asyncQuery(sqlQuery) {
const options = {
query: sqlQuery,
useLegacySql: false,
};
let job;
return bigquery
.createQueryJob(options)
.then(results => {
job = results[0];
console.log(`Job ${job.id} started.`);
return job.promise();
})
.then(() => {
// Get the job's status
return job.getMetadata();
})
.then(metadata => {
// Check the job's status for errors
const errors = metadata[0].status.errors;
if (errors && errors.length > 0) {
throw errors;
}
})
.then(() => {
console.log(`Job ${job.id} completed.`);
return job.getQueryResults();
})
.then(results => {
const rows = results[0];
return rows;
})
.catch(err => {
console.error('ERROR:', err);
});
}
With BigQuery - and any other columnar analytical database - you really want to avoid doing 50 queries like:
[*50] SELECT * FROM big-query-table
WHERE storeNumber = ${StoreNumber}
Instead, the best you could do is one query, specifying the columns you are looking for, and all the ids you're looking for:
SELECT col1, col2, col3
FROM big-query-table
WHERE storeNumber IN ('id1', 'id2', ..., 'id50')
Or filtering with a subquery (effectively a semi-join) against a table of store ids:
SELECT col1, col2, col3
FROM big-query-table
WHERE storeNumber IN (SELECT store_id FROM `table`)
Then you won't need to send 50 concurrent queries, and you'll get results in less time and at lower cost.
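If separate per-district queries are still needed (for example, to keep writing one CSV per district), concurrency can be capped with a small promise pool instead of firing everything at once. A minimal sketch, reusing the array and asyncQuery from the question:

// Run at most `limit` asyncQuery calls at a time; each worker pulls the next
// query off the list as soon as its current one finishes.
async function runWithConcurrencyLimit(queries, limit = 50) {
  const results = [];
  let next = 0;
  async function worker() {
    while (next < queries.length) {
      const index = next++; // claim the next query (single-threaded JS, so no race)
      results[index] = await asyncQuery(queries[index]);
    }
  }
  const workers = Array.from({ length: Math.min(limit, queries.length) }, () => worker());
  await Promise.all(workers);
  return results;
}

// Usage with the district array from the question:
const queries = array.map(element => `SELECT store, sku, tot_sales, price
  FROM big-query-table
  WHERE store IN (${element.store})`);

runWithConcurrencyLimit(queries, 50)
  .then(results => console.log(`${results.length} district queries finished`))
  .catch(err => console.error('ERROR:', err));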
