Parsing CSV with common data preceding other data with NodeJS

I'm trying to read in CSV files with Node.js, and my code looks like this:
fs.createReadStream(file)
  .pipe(csv.parse({ from_line: 6, columns: true, bom: true }, (err, data) => {
    data.forEach((row, i) => {
Since I am using the from_line option, the parsed data starts at line 6 with the header row.
The issue is that line #3 has the date, which I also need together with the other data.
What is the best way to resolve this?
The data file looks like this:
Genre: ABC
Date: 2020-01-01, 2020-12-31
Number of Data: 300
No., Code, Name, sales, delivery, return, stock
1, ......
2, ......
Additional question
I have inserted iconv.decodeStream in the second part of the function.
How can I apply the same decoder to the header read-in process?
fs.createReadStream(file)
  .pipe(iconv.decodeStream("utf-8"))
  .pipe(csv.parse({ from_line: 6, columns: true, bom: true }, (err, data) => {
    data.forEach((row, i) => {

I'd suggest reading the header data first, running those bytes through the same iconv decoder (which also answers your additional question); you can then access the header in your processing callback(s), something like the example below:
app.js
// Import the packages
const csv = require('csv')
const fs = require("fs");
const { promisify } = require('util');
const parse = promisify(csv.parse);
const iconv = require('iconv-lite');

async function readHeaderData(file, iconv) {
  // Read the first 1 KB of the file synchronously; plenty for the 3 header lines.
  let buffer = Buffer.alloc(1024);
  const fd = fs.openSync(file, 'r');
  fs.readSync(fd, buffer, 0, buffer.length, 0);
  fs.closeSync(fd);
  // Run the bytes through the same decoder used by the streaming parser.
  // (iconv.decode is synchronous and returns a string.)
  const text = iconv.decode(buffer, "utf-8");
  // Parse only the first 3 lines, splitting each "Key: value" pair on ':'.
  const options = { to_line: 3, delimiter: ':', columns: false, bom: true, trim: true };
  const rows = await parse(text, options);
  // Convert the array of [key, value] pairs to an object
  return Object.fromEntries(rows);
}

async function readFile(file, iconv) {
  const header = await readHeaderData(file, iconv);
  console.log("readFile: File header:", header);
  fs.createReadStream(file)
    .pipe(iconv.decodeStream("utf-8"))
    .pipe(csv.parse({ from_line: 6, columns: true, bom: true, trim: true }, (err, data) => {
      // We now have access to the header data along with the row data in the callback.
      data.forEach((row, i) => console.log({ line: i, header, row }))
    }));
}

readFile('stream-with-skip.csv', iconv)
This will give an output like:
readFile: File header: {
  Genre: 'ABC',
  Date: '2020-01-01, 2020-12-31',
  'Number of Data': '300'
}
and
{
  line: 0,
  header: {
    Genre: 'ABC',
    Date: '2020-01-01, 2020-12-31',
    'Number of Data': '300'
  },
  row: {
    'No.': '1',
    Code: 'Code1',
    Name: 'Name1',
    sales: 'sales1',
    delivery: 'delivery1',
    return: 'return1',
    stock: 'stock1'
  }
}
{
  line: 1,
  header: {
    Genre: 'ABC',
    Date: '2020-01-01, 2020-12-31',
    'Number of Data': '300'
  },
  row: {
    'No.': '2',
    Code: 'Code2',
    Name: 'Name2',
    sales: 'sales2',
    delivery: 'delivery2',
    return: 'return2',
    stock: 'stock2'
  }
}
example.csv
Genre: ABC
Date: 2020-01-01, 2020-12-31
Number of Data: 300
No., Code, Name, sales, delivery, return, stock
1, Code1, Name1, sales1, delivery1, return1, stock1
2, Code2, Name2, sales2, delivery2, return2, stock2
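If you then need the two dates from the Date header separately, the parsed value can be split after the fact. A minimal sketch, assuming the 'Date: 2020-01-01, 2020-12-31' format shown above:
const [startDate, endDate] = header.Date.split(',').map((s) => s.trim());
console.log(startDate); // '2020-01-01'
console.log(endDate);   // '2020-12-31'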

Related

Insert into JSON objects postgres using Node without forloop for every db.query

I have an array of objects with the same setup as the one below. I want to insert this array of objects into a postgres table that looks like this: [![Table setup][1]][1]
I have tried the function below, but it returns an error when inserting ÅKRA UNGDOMSSKOLE: because the value is an unescaped string, the query breaks at the space and crashes on the second input value. How can I make it understand it is a string?
{
  '#type': 'SensorSystem',
  id: 'SN47230',
  name: 'ÅKRA UNGDOMSSKOLE',
  shortName: 'Åkra ',
  country: 'Norge',
  countryCode: 'NO',
  geometry: {
    '#type': 'Point',
    coordinates: [ 5.1963, 59.2555 ],
    nearest: false
  },
  masl: 18,
  validFrom: '2013-10-29T00:00:00.000Z',
  county: 'ROGALAND',
  countyId: 11,
  municipality: 'KARMØY',
  municipalityId: 1149,
  stationHolders: [ 'KARMØY KOMMUNE' ],
  externalIds: [ '506131077' ],
  wigosId: '0-578-0-47230'
}
Error code:
error: syntax error at or near "UNGDOMSSKOLE"
What I have tried so far:
let sqlinsert = data.data.map((source) => {
  if (source.geometry) {
    if (!source.masl) {
      source.masl = 0
    }
    let Point = `POINT(${source.geometry.coordinates[0]} ${source.geometry.coordinates[1]})`;
    return `(${source.id}, ${source.name}, ${source.shortName},${source.country},${source.countryCode},${source.masl},${source.geometry.coordinates[0]},${source.geometry.coordinates[1]},${Point},${source.validFrom},${source.county},${source.countyId},${source.municipality},${source.municipalityId})`
  }
})
const result = await db.query("INSERT INTO sources(source_id,name,shortName,country,countryCode,masl,long,lat,geog,valid_from,county,countyId,municipality,municipalityId) values"+sqlinsert[0])
A second problem I have is that inserting
POINT(59.2555 5.1963)
gives a syntax error at 5.1963.
[1]: https://i.stack.imgur.com/4RSkq.png
The main problem with your query as written is that you are interpolating raw, unescaped values into your VALUES records. You can use escapeLiteral on your db client to ensure that these values are properly escaped, which will solve the syntax errors you are getting:
const data = [
  {
    "#type": "SensorSystem",
    id: "SN47230",
    name: "ÅKRA UNGDOMSSKOLE",
    shortName: "Åkra ",
    country: "Norge",
    countryCode: "NO",
    geometry: {
      "#type": "Point",
      coordinates: [5.1963, 59.2555],
      nearest: false,
    },
    masl: 18,
    validFrom: "2013-10-29T00:00:00.000Z",
    county: "ROGALAND",
    countyId: 11,
    municipality: "KARMØY",
    municipalityId: 1149,
    stationHolders: ["KARMØY KOMMUNE"],
    externalIds: ["506131077"],
    wigosId: "0-578-0-47230",
  },
].map((source) => {
  const {
    id,
    name,
    shortName,
    country,
    countryCode,
    masl,
    geometry: {
      // the coordinates in your source data appear to be in y,x instead of
      // x,y. Treating them as x,y results in the point being located
      // in the Indian Ocean while y,x is somewhere in Norway.
      coordinates: [lat, long],
    },
    validFrom,
    county,
    countyId,
    municipality,
    municipalityId,
  } = source;
  return [
    id,
    name,
    shortName,
    country,
    countryCode,
    masl || 0,
    long,
    lat,
    `POINT( ${long} ${lat} )`,
    validFrom,
    county,
    countyId,
    municipality,
    municipalityId,
  ];
});

const headers = [
  "source_id",
  "name",
  "shortname",
  "country",
  "countrycode",
  "masl",
  "long",
  "lat",
  "geog",
  "valid_from",
  "county",
  "countyid",
  "municipality",
  "municipalityid",
];

const sourceValStr = data
  .map((sourceRecords, rowIndex) => {
    return sourceRecords
      .map((value, colIndex) => {
        if (typeof value === "string") {
          // safely escape string values
          return dbClient.escapeLiteral(value);
        }
        if (
          typeof value === "number" ||
          typeof value === "boolean" ||
          typeof value === "bigint"
        ) {
          return value;
        }
        if (value === undefined || value === null) {
          return "null";
        }
        throw new Error(
          `non-simple value: ${JSON.stringify(value)} for ${
            headers[colIndex]
          } at row ${rowIndex}`
        );
      })
      .join(",");
  })
  .map((value) => `(${value})`)
  .join(",");

const sourceInsert = `INSERT INTO sources(${headers.join(",")}) VALUES ${sourceValStr};`;

await dbClient.query(sourceInsert);
A much more efficient and scalable way to insert the rows is to use the pg-copy-streams library in conjunction with a CSV library like csv-stringify, which will bulk insert using a COPY FROM stream:
import { from as copyFrom } from "pg-copy-streams";
import { stringify } from "csv-stringify";

// ...

const copyStmt = `COPY sources(${headers.join(",")}) FROM STDIN (FORMAT CSV)`;
await new Promise<void>((resolve, reject) => {
  const copyStream = dbClient.query(copyFrom(copyStmt));
  const stringifiedStream = stringify(data, {
    header: false,
    encoding: "utf-8",
    delimiter: ",",
    quote: '"',
  });
  // Surface errors from either side of the pipe, and resolve once the
  // COPY stream has finished flushing to the database.
  copyStream.on("error", reject).on("finish", () => resolve());
  stringifiedStream.on("error", reject);
  stringifiedStream.pipe(copyStream);
});
On my low-end laptop, this approach takes about 39 seconds to insert a million rows with no database optimizations.

Trying to simulate cell level TTL in bigtable but whole column family data is getting removed by garbage collection

I created a table with the following rules; with this, data should expire after 1 second (as per the docs):
async function createTable() {
  console.log("Creating Table");
  const options = {
    families: [
      {
        name: 'payloads',
        rule: {
          age: {
            seconds: 1,
            nanos: 0,
          },
        },
      },
    ],
  };
  try {
    await table.create(options);
    console.log("Successfully Created Table");
  } catch (err) {
    console.error(`Error reading rows :`, err);
  }
}
And then inserted the data like this:
const rowsToInsert = {
  key: "SUMEET",
  data: {
    payloads: {
      '1': {
        value: "NOTIFICATIONS_PAYLOAD_1",
        timestamp: 1576500927000,
      },
      '2': {
        value: "NOTIFICATIONS_PAYLOAD_2",
        timestamp: 1576587327000,
      },
      '3': {
        value: "NOTIFICATIONS_PAYLOAD_3",
        timestamp: 1576673727000,
      },
      '4': {
        value: "NOTIFICATIONS_PAYLOAD_4",
        timestamp: 1576760127000,
      },
    },
  },
};

await table.insert(rowsToInsert);
So I added four cells with different timestamps:
First: 5 minutes ahead of the time when I am writing the data
Second: 1 hour ahead
Third: 1 day ahead
Fourth: 2 days ahead
The problem is that when I read the data back, the whole column family is deleted, but as per the rules set it should delete only the first and second cells.
Is there anything I am missing or doing wrong?
The issue is in your timestamps: you are most likely setting dates from the past. I would recommend setting the dates with the Date methods from JavaScript instead of computing them manually like you are doing right now.
I did some tests with the following code:
const Bigtable = require('@google-cloud/bigtable');
const bigtable = Bigtable();
const instance = bigtable.instance([instance]);
const table = instance.table([table]);

async function writeSimple() {
  var now = new Date(Date.now());
  var fiveMinutes = new Date(now.getTime() + 5 * 60000);
  var anHour = new Date(now.getTime() + 60 * 60000);
  var aDay = new Date(now.getTime() + 24 * 60 * 60000);
  var twoDays = new Date(now.getTime() + 48 * 60 * 60000);
  const rowsToInsert = {
    key: "SUMEET",
    data: {
      payloads: {
        '1': {
          value: "NOTIFICATIONS_PAYLOAD_1",
          timestamp: now,
        },
        '2': {
          value: "NOTIFICATIONS_PAYLOAD_2",
          timestamp: fiveMinutes,
        },
        '3': {
          value: "NOTIFICATIONS_PAYLOAD_3",
          timestamp: anHour,
        },
        '4': {
          value: "NOTIFICATIONS_PAYLOAD_4",
          timestamp: aDay,
        },
        '5': {
          value: "NOTIFICATIONS_PAYLOAD_5",
          timestamp: twoDays,
        },
      },
    },
  };
  await table.insert(rowsToInsert);
  console.log(`Successfully wrote row ${rowsToInsert.key}`);
}
and obtained a row like the following one:
2019/12/17 16:53:33 -creds flag unset, will use gcloud credential
----------------------------------------
SUMEET
payloads:1 # 2019/12/17-16:30:34.343000
"NOTIFICATIONS_PAYLOAD_1"
payloads:2 # 2019/12/17-16:35:34.343000
"NOTIFICATIONS_PAYLOAD_2"
payloads:3 # 2019/12/17-17:30:34.343000
"NOTIFICATIONS_PAYLOAD_3"
payloads:4 # 2019/12/18-16:30:34.343000
"NOTIFICATIONS_PAYLOAD_4"
payloads:5 # 2019/12/19-16:30:34.343000
"NOTIFICATIONS_PAYLOAD_5"
And after the garbage collector passed (around 15 minutes later) I got the result that you desired:
2019/12/17 16:59:47 -creds flag unset, will use gcloud credential
----------------------------------------
SUMEET
payloads:3 # 2019/12/17-17:30:34.343000
"NOTIFICATIONS_PAYLOAD_3"
payloads:4 # 2019/12/18-16:30:34.343000
"NOTIFICATIONS_PAYLOAD_4"
payloads:5 # 2019/12/19-16:30:34.343000
"NOTIFICATIONS_PAYLOAD_5"
I hope you find this useful!
You have the timestamps in milliseconds. Cloud Bigtable requires microseconds (see here). Basically, add '000' to the end of all of your timestamps.
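For example, a minimal sketch of that conversion, reusing the first numeric timestamp from the question:
// Milliseconds since the epoch (what the question used)...
const ms = 1576500927000;
// ...become microseconds by multiplying by 1000, i.e. appending '000'.
const micros = ms * 1000; // 1576500927000000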

Insert Unicode text file data into database

I need a solution for getting data out of a text file that is Unicode encoded and writing this data into an existing database. I am working with node.js.
My text file looks like this (whitespace shown as-is):
Text file
Column Active Description Number
1 True test test 0000
1 True test1 test1 0001
1 True test2 test2 0002
1 True test3 test3 0003
1 True test4 test4 0004
I need to get the values of Column, Active, Description and Number for each row in order to create a new entry in my database with the respective values. My database already exists and has the columns Column, Active, Description and Number. What is the easiest way to write the data of the text file into my database? I tried a CSV parser:
const csv = require('csv-streamify');
const fs = require('fs');

const parser = csv({
  delimiter: '\t',
  columns: true,
});

parser.on('data', (line) => {
  console.log(line)
})

fs.createReadStream('test.txt').pipe(parser)
but that only showed me output like this:
{ '��C\u0000o\u0000l\u0000u\u0000m\u0000n ...
How do I get the correct output, and what do I have to do to write the data into the database?
I think it has to be something like this:
connection.query('INSERT INTO Table1 SET ...
I really don't know how to continue.
Using 'byline' instead of 'csv-streamify', you can retrieve the content as follows:
const byline = require('byline');
const fs = require('fs');

// Note: if the file is UTF-16 (which the garbled output in the question
// suggests), pass the encoding to the read stream, e.g.
// fs.createReadStream('test.txt', { encoding: 'utf16le' })
var stream = byline(fs.createReadStream('test.txt'));
var index = 0;
var headers;
var data = [];

stream.on('data', function(line) {
  var currentData;
  var entry;
  var i;
  line = line.toString(); // Convert the buffer to a string line
  if (index === 0) {
    headers = line.split(/[ ]+/);
  } else {
    currentData = line.split(/[ ]+/);
    entry = {};
    for (i = 0; i < headers.length; i++) {
      entry[headers[i]] = currentData[i];
    }
    data.push(entry);
  }
  index++;
});
stream.on("error", function(err) {
  console.log(err);
});
stream.on("end", function() {
  console.log(data);
  console.log("Done");
});
Everything is stored in the 'data' array as follows:
[ { Column: '1',
    Active: 'True',
    Description: 'test',
    Number: 'test' },
  { Column: '1',
    Active: 'True',
    Description: 'test1',
    Number: 'test1' },
  { Column: '1',
    Active: 'True',
    Description: 'test2',
    Number: 'test2' },
  { Column: '1',
    Active: 'True',
    Description: 'test3',
    Number: 'test3' },
  { Column: '1',
    Active: 'True',
    Description: 'test4',
    Number: 'test4' } ]
Then, it shouldn't be too difficult to write to the SQL database.
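For that last step, a minimal sketch with the mysql package, assuming a MySQL database (the question's INSERT INTO Table1 SET ... snippet suggests it) and hypothetical connection settings:
const mysql = require('mysql');
const connection = mysql.createConnection({
  host: 'localhost',   // adjust to your database
  user: 'user',
  password: 'pass',
  database: 'mydb',
});
stream.on('end', function() {
  data.forEach(function(entry) {
    // 'SET ?' expands the object into column = value pairs
    connection.query('INSERT INTO Table1 SET ?', entry, function(err) {
      if (err) console.log(err);
    });
  });
});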

How to write the parameter to update a particular property's value in an object from an array?

How do I update the quantity value based on the title in the movies array and the Item id (123)?
I only manage to update a value at the first layer, like name ('David'), but I don't know how to update the second layer with an additional filter for the array (movies).
From:
Item:
{
  id: 123,
  name: 'David',
  movies: [
    {
      id: 1,
      title: 'The lord of the ring',
      quantity: 1
    },
    {
      id: 2,
      title: 'Star Wars',
      quantity: 1
    }
  ]
}
To:
Item:
{
  id: 123,
  name: 'David',
  movies: [
    {
      id: 1,
      title: 'The lord of the ring',
      quantity: 2
    },
    {
      id: 2,
      title: 'Star Wars',
      quantity: 1
    }
  ]
}
By the way, I'm using the AWS DynamoDB document client in node.js; it would be nice if you could share how you would do it in your update parameters.
There is no way to update an object inside of a list without replacing it.
You probably want to restructure your table to emulate a relational data model. AWS has some documentation on this.
As an example, create your table like this:
aws dynamodb create-table \
  --table-name movie-table \
  --attribute-definitions AttributeName=rId,AttributeType=N AttributeName=rKey,AttributeType=S \
  --key-schema AttributeName=rId,KeyType=HASH AttributeName=rKey,KeyType=RANGE
The table will have generically named hash and range keys. This script demonstrates how to structure the data and add to the "count":
const { DynamoDB } = require('aws-sdk');

const client = new DynamoDB.DocumentClient({ region: 'us-east-1' });

const addItem = (rId, rKey, attributes) => {
  const item = { rId, rKey };
  Object.assign(item, attributes);
  return client.put({ TableName: 'movie-table', Item: item }).promise();
};

// NOTE: this is where the count attribute gets incremented
const addToCount = (rId, rKey) => client.update({
  TableName: 'movie-table',
  Key: { rId, rKey },
  UpdateExpression: 'ADD #count :n',
  ExpressionAttributeNames: { '#count': 'count' },
  ExpressionAttributeValues: { ':n': 1 },
}).promise();

const run = async () => {
  await addItem(123, 'USER|123', { name: 'David' });
  await addItem(1, 'MOVIE|1', { title: 'The lord of the ring' });
  await addItem(2, 'MOVIE|2', { title: 'Star Wars' });
  await addItem(123, 'COUNT|1', { count: 1 });
  await addItem(123, 'COUNT|2', { count: 1 });
  await addToCount(123, 'COUNT|1');
};

run();
I know this is a bit old, but there is a way. Using the document client SDK, you can reference object properties and array elements in the UpdateExpression. However, you can't run any logic, so you have to know/assume/expect that the element indexes are enough.
For example, you can do something like this:
let params = {
  TableName: 'your-table-name',
  Key: { id: 123 },
  UpdateExpression: 'set movies[0].quantity = :x',
  ExpressionAttributeValues: { ':x': 5 }
};

const client = new AWS.DynamoDB.DocumentClient();
client.update(params).promise();
NOTE: You cannot make the index an Expression Attribute Value. You would have to dynamically build that update expression based on the index you know has to be updated. It's not a perfect solution, but it could get the job done.
For reference, I derived this from the base (non-DocumentClient) example from here: Adding Nested Map Attributes
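For example, a sketch of building that expression dynamically by first reading the item and locating the movie's index by title (the table and attribute names follow the question; the helper itself is hypothetical):
const AWS = require('aws-sdk');
const client = new AWS.DynamoDB.DocumentClient();

async function addQuantityByTitle(id, title) {
  // Read the item to locate the movie's position in the list
  const { Item } = await client.get({ TableName: 'your-table-name', Key: { id } }).promise();
  const index = Item.movies.findIndex((movie) => movie.title === title);
  if (index === -1) throw new Error(`movie not found: ${title}`);
  // The index has to be baked into the expression string itself
  return client.update({
    TableName: 'your-table-name',
    Key: { id },
    UpdateExpression: `set movies[${index}].quantity = movies[${index}].quantity + :inc`,
    ExpressionAttributeValues: { ':inc': 1 },
  }).promise();
}

addQuantityByTitle(123, 'The lord of the ring');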

Q promises and mongo db q.all

I'm making calls to a mongodb database: pulling data out, reading it, and then making further requests based on that data. Once all the data has been received, I wish to process it.
I've been using the Q promise library but don't know what I'm doing. I thought that q.all would only trigger once everything has completed? However, my processPlaylist function runs twice. I've commented the code below:
Thanks,
Rob
var q = require('q'); // assumed; the require was missing from the snippet
var _ = require('lodash'); // assumed; the require was missing from the snippet
var PlaylistCollection = require('./models/playlist');
var AssetCollection = require('./models/asset');

var screenID = '############';
var playerData = [];
// array containing playlists which have been synced
var alreadySynced = [];

// Process our playlist once downloaded
var processPlaylist = function (playerData) {
  console.log('----Processing Playerlist-----')
  console.log(playerData);
  // DO STUFF
}

// Get playlist by id. Return playlist data
var getSubLists = function (id) {
  return PlaylistCollection.findById(id);
}

// Get sub-playlist function
function getSubListRecursive(id) {
  return getSubLists(id).then(function (playlist) {
    // store all our returned playlist data into a playlist array
    playerData.push(playlist)
    // an array to keep tabs on what we've already pulled down
    alreadySynced.push(playlist.id)
    // get all our playlist.resources, and only return those which are unique
    var playlistResources = _.uniq(playlist.resources, 'rid');
    // console.log('Playlist Resources: ', playlistResources)
    // console.log(alreadySynced);
    var sublists = _.pluck(_.filter(playlistResources, { 'type': 'playlist' }), 'rid');
    // remove playlists which have already been synced. We don't want to pull them down twice
    sublists = _.difference(sublists, alreadySynced);
    // console.log('sublists: ', sublists)
    // Get the next playlist and so on...
    var dbops = sublists.map(function (sublist) {
      // console.log(sublist)
      return getSubListRecursive(sublist)
    });
    q.all(dbops).then(function () {
      console.log('All Done - so process the playlist')
      return processPlaylist(playerData);
    });
  })
}

// Trigger the whole process..... grab our first playlist / ScreenID
getSubListRecursive(screenID);
and I get the following output:
----Processing Playerlist-----
[ { _id: 554d1df16ce4c438f8e2225b,
    title: 'list 1',
    __v: 29,
    daily: true,
    endTime: '',
    startTime: '',
    resources:
     [ { rid: '55650cebef204ab70302a4d9',
         title: 'list 4',
         type: 'playlist' },
       { rid: '554d1df16ce4c438f8e2225b',
         title: 'list 1',
         type: 'playlist' } ] },
  { _id: 55650cebef204ab70302a4d9,
    title: 'list 4',
    __v: 1,
    daily: false,
    endTime: '',
    startTime: '',
    resources:
     [ { rid: '55650647ef204ab70302a4d8',
         title: 'list 3',
         type: 'playlist' } ] } ]
All Done - so process the playlist
----Processing Playerlist-----
[ { _id: 554d1df16ce4c438f8e2225b,
    title: 'list 1',
    __v: 29,
    daily: true,
    endTime: '',
    startTime: '',
    resources:
     [ { rid: '55650cebef204ab70302a4d9',
         title: 'list 4',
         type: 'playlist' },
       { rid: '554d1df16ce4c438f8e2225b',
         title: 'list 1',
         type: 'playlist' } ] },
  { _id: 55650cebef204ab70302a4d9,
    title: 'list 4',
    __v: 1,
    daily: false,
    endTime: '',
    startTime: '',
    resources:
     [ { rid: '55650647ef204ab70302a4d8',
         title: 'list 3',
         type: 'playlist' } ] },
  { _id: 55650647ef204ab70302a4d8,
    title: 'list 3',
    __v: 5,
    daily: false,
    endTime: '',
    startTime: '',
    resources:
     [ { rid: '55650637ef204ab70302a4d7',
         title: 'list 2',
         type: 'playlist' },
       { rid: '554d1df16ce4c438f8e2225b',
         title: 'list 1',
         type: 'playlist' },
       { rid: '55650cebef204ab70302a4d9',
         title: 'list 4',
         type: 'playlist' } ] } ]
EDIT
There were a number of things wrong with what I wrote. I discussed it with a buddy of mine, who pointed out that getSubListRecursive is invoked recursively several times, so the q.all statement is executed several times...
So I refactored...
// Get sub-playlist function
function getSubListRecursive(id) {
  console.log(id)
  return getSubLists(id).then(function (playlist) {
    if (playlist) {
      // store all our returned playlist data into a playlist array
      playerData.push(playlist)
      // an array to keep tabs on what we've already pulled down
      alreadySynced.push(playlist.id)
      // get all our playlist.resources, and only return those which are unique
      var playlistResources = _.uniq(playlist.resources, 'rid');
      // console.log('Playlist Resources: ', playlistResources)
      // console.log(alreadySynced);
      var sublists = _.pluck(_.filter(playlistResources, { 'type': 'playlist' }), 'rid');
      // remove playlists which have already been synced. We don't want to pull them down twice
      sublists = _.difference(sublists, alreadySynced);
      // console.log('sublists: ', sublists)
      return sublists.map(function (sublist) {
        // console.log(sublist)
        if (sublists.length > 0) {
          return getSubListRecursive(sublist)
        } else {
          return processPlaylist(playerData);
        }
      });
    } else {
      return processPlaylist(playerData);
    }
  });
}
This works. I'm basically using promises to control the flow here, which probably isn't the best way of doing it? I no longer use an all statement, and I ultimately end up with an array populated with all the playlist data, which I can manipulate in my processPlaylist function.
However, I've not marked the question as solved, as I'd really like to know how I can do this with Q.all (properly use promises).
Thanks,
Rob
I think you were just confused about when the entire process was finished. You need to wait until the entire recursive promise chain has resolved. I think you could use the original code with a slight change to where processPlaylist() is called:
var q = require('q'); // assumed; the require was missing from the snippet
var _ = require('lodash'); // assumed; the require was missing from the snippet
var PlaylistCollection = require('./models/playlist');
var AssetCollection = require('./models/asset');

var screenID = '############';
var playerData = [];
// array containing playlists which have been synced
var alreadySynced = [];

// Process our playlist once downloaded
var processPlaylist = function (playerData) {
  console.log('----Processing Playerlist-----')
  console.log(playerData);
  // DO STUFF
}

// Get playlist by id. Return playlist data
var getSubLists = function (id) {
  return PlaylistCollection.findById(id);
}

// Get sub-playlist function
function getSubListRecursive(id) {
  return getSubLists(id).then(function (playlist) {
    // store all our returned playlist data into a playlist array
    playerData.push(playlist)
    // an array to keep tabs on what we've already pulled down
    alreadySynced.push(playlist.id)
    // get all our playlist.resources, and only return those which are unique
    var playlistResources = _.uniq(playlist.resources, 'rid');
    var sublists = _.pluck(_.filter(playlistResources, { 'type': 'playlist' }), 'rid');
    // remove playlists which have already been synced. We don't want to pull them down twice
    sublists = _.difference(sublists, alreadySynced);
    // Get the next playlist and so on...
    var dbops = sublists.map(function (sublist) {
      return getSubListRecursive(sublist)
    });
    // Return the aggregate promise so the caller can wait on the whole subtree
    return q.all(dbops);
  });
}

// Trigger the whole process..... grab our first playlist / ScreenID
getSubListRecursive(screenID).then(function () {
  console.log('All Done - so process the playlist')
  return processPlaylist(playerData);
});
