Reading multiple files from google bucket & loading into BQ using nodejs - node.js

When i try to read files from google bucket and load data in bigquery table, google bucket throws me timeout error. is there a way to read files synchronously and load to bigquery table.
This one works for when files are less, and tried using then as well which also gives same error.
const { BigQuery } = require('#google-cloud/bigquery');
const { Storage } = require('#google-cloud/storage');
var fs = require("fs");
const bucketName = 'bucketname';
const gcpProject = "projectname";
const datasetprojectname = "bqprojectname";
const datasetId = "dsname";
const tableId = "tablename";
exports.helloworld = async (req, res) => {
const bigquery = new BigQuery({ projectId: datasetprojectname });
const storage = new Storage(gcpProject);
const loaddatabq = new Storage(gcpProject);
const bucket = storage.bucket(bucketName);
const fileoptions = {
prefix: "singlefile"
};
var filecount = 0;
var errcount = 0;
var filemoveerrcount = 0;
const [getfilename] = await bucket.getFiles(fileoptions);
var filenamespring = "";
var getjson = null;
getfilename.forEach(async files => {
try {
filecount++;
var filename = files.name;
if (filename != "singlefile/") {
var contents = await files.download(files.name);
await bigquery.dataset(datasetId).table(tableId).insert(JSON.parse(contents).body);
}
}
catch (err) {
}
});
};

If your file are in JSONL (1 JSON document per line, JSON Line), you can use the load job to achieve this.
You can filter on the file that you want by using wildcard character. It will be more efficient than a for loop.
This solution is also cheaper. You are limited to 1500 loads per table and per day, but the load is free. In your current code you use the streaming API, and you pay for it ($0.05 per Gb)

Related

Puppeteer to convert html to pdf using Nodejs in Durable functions(fan out fan in)

I'm working on a small project to convert a large xml to several formatted pdf documents. The large xml contains multiple similar format xmls. So I'm using a single html template for printing all the documents. After producing all the pdf documents I also need to produce a metadata file with some basic info on each document that was printed.
I thought using the fan out fan in scenario of durable functions is a perfect for my use case. I'm working with Nodejs. I setup all my code and it seems to be working fine locally. The Orchestration function looks like the below.
const df = require("durable-functions");
module.exports = df.orchestrator(function* (context) {
var xmldata = yield context.df.callActivity("DurablegetblobJS1","");
var tasks = [];
for (file of xmldata) {
tasks.push(context.df.callActivity("Durableactivityjs2", file));
}
const outputs = yield context.df.Task.all(tasks);
var finalout = "";
for (out of outputs){
console.log('I am done1 :' + out );
finalout = finalout + out;
}
return finalout;
});
DurablegetblobJS1 : Fetches the entire xmls and splits it into multiple smaller xmls(1 per document).
Durableactivityjs2 : Fetches the html template, extracts the different values from the individual xmls and applies them to the html and finally prints out the pdf into an azure storage. It returns the name of the pdf document that was printed for creation of the metadata file. The code for this is below.
var fs = require('fs');
var xml2js = require('xml2js');
var html_to_pdf = require('html-pdf-node');
var parser = new xml2js.Parser();
module.exports = async function (context) {
//console.log("Hello from activity :")
var xmldict = {}
var xmltext = context.bindings.name;
//Extract the nodes and attributes
metadata(xmltext,xmldict);
report(xmltext,xmldict);
context.log(xmldict)
const { BlobServiceClient } = require("#azure/storage-blob");
// Load the .env file if it exists
require("dotenv").config();
const AZURE_STORAGE_CONNECTION_STRING = process.env.STORAGE_CONNECTION_STRING || "";
const blobServiceClient = BlobServiceClient.fromConnectionString(
AZURE_STORAGE_CONNECTION_STRING
);
var containerClient = blobServiceClient.getContainerClient('test');
var blobname = 'comb_template.html';
var blockBlobClient = containerClient.getBlockBlobClient(blobname);
var downloadBlockBlobResponse = await blockBlobClient.download(0);
var html_template = await streamToText(downloadBlockBlobResponse.readableStreamBody);
let options = { format: 'A4'};
let file = { content: html_template};
const x = await writepdf1(file, options,blobServiceClient,xmldict);
console.log("Written Blob PDF");
return x;
};
async function writepdf1(file, options,blobServiceClient,xmldict){
const pdfBuffer = await html_to_pdf.generatePdf(file, options);
const containerClient = blobServiceClient.getContainerClient('test2');
const targetblob = xmldict['OU'] + '/' + xmldict['ReportName'] + '/' + xmldict['OU'] + '_' + xmldict['ReportName'] + '_' + xmldict['DocumentID'] + '_' + '.pdf';
console.log('Blob name :' + targetblob);
const blockBlobClient_t = containerClient.getBlockBlobClient(targetblob);
const uploadBlobResponse = await blockBlobClient_t.upload(pdfBuffer, pdfBuffer.length);
return targetblob;
}
async function streamToText(readable) {
readable.setEncoding('utf8');
let data = '';
for await (const chunk of readable) {
data += chunk;
}
return data;
}
function metadata(xmltext,xmldict){
parser.parseString(xmltext, function (err, result) {
var test1 = result['HPDPSMsg']['DocumentRequest'][0]['MetaData'][0];
Object.entries(test1).forEach(([key, value]) => {
xmldict[key] = value[0];
});
});
}
function report(xmltext,xmldict){
parser.parseString(xmltext, function (err, result) {
var test2 = result['HPDPSMsg']['DocumentRequest'][0]['Report'][0]['$'];
Object.entries(test2).forEach(([key, value]) => {
xmldict[key] = value;
});
});
}
However, when I deploy the entire project into a azure premium function(EP1 - Windows), I see some errors in app insights when I try and execute my function and the pdfs are never generated.
Activity function 'Durableactivityjs2' failed: Could not find browser
revision 818858. Run "PUPPETEER_PRODUCT=firefox npm install" or
"PUPPETEER_PRODUCT=firefox yarn install" to download a supported
Firefox browser binary
I'm a bit clueless how I'm supposed to resolve this. Any help or suggestions would be appreciated.

List azure file shares snapshots with node.js

Is there any way to list in node.js the list of snapshots that a file share has?
Example code:
const { ShareServiceClient, StorageSharedKeyCredential } = require("#azure/storage-file-share");
const credential = new StorageSharedKeyCredential(AZURE_STORAGE_ACCOUNT,AZURE_STORAGE_ACCESS_KEY);
const shareServiceClient = new ShareServiceClient(AZURE_STORAGE_CONNECTION_STRING,credential);
var shareName = "xxxxx";
var shareClient = shareServiceClient.getShareClient(shareName);
// Create a snapshot:
await shareClient.createSnapshot();
How to list the snapshots that this shareName has?
As such there's no special method to list snapshots for a file share. You will need to call listShares method of ShareServiceClient
(#azure/storage-file-share version 12.5.0) with includeSnapshots parameter as true and prefix as the share name.
Here's the sample code to do so (untested code):
const shareName = 'share-name';
const listingOptions = {
prefix: shareName,
includeSnapshots: true
};
shareServiceClient.listShares(listingOptions).byPage().next()
.then((result) => {
const shareItems = result.value.shareItems;
//Filter results where name of the share is same as share name and is a snapshot
const shareSnapshots = shareItems.filter(s => s.name === shareName && s.snapshot && s.snapshot !== '');
console.log(shareSnapshots);
})
.catch((error) => {
console.log(error);
})

nodeJS async function parse csv return data to other file

I'm creating a small tool for internal user with puppeteer.
Basically I got a csv file with some data i "read" and fill form with.
As I try to cleanup my project to be reusable i'm struggle a little bit:
I create a file name parsecsv.js
const config = require('../config.json');
const parse = require('csv-parse');
const fs = require('fs');
const processFile = async () => {
records = []
const parser = fs
.createReadStream(config.sourceFile)
.pipe(parse({
// CSV options
from_line: 1,
delimiter: ";",
}));
let i =1;
for await (const record of parser) {
records.push(record)
i++;
}
return records
}
const processFileData = async () => {
const records = await processFile()
console.info(records);
return records
}
module.exports ={
processFile, processFileData
}
in an other Js file i made
const parseCSV = require('./src/ParseCsv');
const records = parseCSV.processFileData();
const data = parseCSV.processFile();
console.log(typeof records);
console.table(records);
console.log(typeof data);
console.table(data);
But I never get my data only an empty oject.
How I can get my data to be able to "share" it with other function ?
thanks
as your functions are async ones and they return a promises, you can do something like
const parseCSV = require('./src/ParseCsv');
(async () => {
const records = await parseCSV.processFileData();
const data = await parseCSV.processFile();
console.log(typeof records);
console.table(records);
console.log(typeof data);
console.table(data);
})()

Split pdf into multiple pages preferably into its pages and save the various files in a folder using node js

Is it possible to split a pdf file into the number of pages it has and save these files in a folder using node js?
Using pdf-lib this should be fairly simple. This code should help you get started, it still needs some error-handling of course:
const fs = require('fs');
const PDFDocument = require('pdf-lib').PDFDocument;
async function splitPdf(pathToPdf) {
const docmentAsBytes = await fs.promises.readFile(pathToPdf);
// Load your PDFDocument
const pdfDoc = await PDFDocument.load(docmentAsBytes)
const numberOfPages = pdfDoc.getPages().length;
for (let i = 0; i < numberOfPages; i++) {
// Create a new "sub" document
const subDocument = await PDFDocument.create();
// copy the page at current index
const [copiedPage] = await subDocument.copyPages(pdfDoc, [i])
subDocument.addPage(copiedPage);
const pdfBytes = await subDocument.save()
await writePdfBytesToFile(`file-${i + 1}.pdf`, pdfBytes);
}
}
function writePdfBytesToFile(fileName, pdfBytes) {
return fs.promises.writeFile(fileName, pdfBytes);
}
(async () => {
await splitPdf("./path-to-your-file.pdf");
})();

Reading a csv file async - NodeJS

I am trying to create a function where I can pass file path and the read the file in async way. What I found out was that it supports streams()
const fs = require('fs');
var parse = require('csv-parse');
var async = require('async');
readCSVData = async (filePath): Promise<any> => {
let csvString = '';
var parser = parse({delimiter: ','}, function (err, data) {
async.eachSeries(data, function (line, callback) {
csvString = csvString + line.join(',')+'\n';
console.log(csvString) // I can see this value getting populated
})
});
fs.createReadStream(filePath).pipe(parser);
}
I got this code from here. but I am new to node js so I am not getting how to use await to get the data once all lines are parsed.
const csvData = await this.util.readCSVData(path)
My best workaround for this task is:
const csv = require('csvtojson')
const csvFilePath = 'data.csv'
const array = await csv().fromFile(csvFilePath);
This answer provides legacy code that uses async library. Promise-based control flow with async doesn't need this library. Asynchronous processing with async.eachSeries doesn't serve a good purpose inside csv-parse callback because a callback waits for data to be filled with all collected data.
If reading all data into memory is not an issue, CSV stream can be converted to a promise:
const fs = require('fs');
const getStream = require('get-stream');
const parse = require('csv-parse');
readCSVData = async (filePath): Promise<any> => {
const parseStream = parse({delimiter: ','});
const data = await getStream.array(fs.createReadStream(filePath).pipe(parseStream));
return data.map(line => line.join(',')).join('\n');
}

Resources