Puppeteer to convert HTML to PDF using Node.js in Durable Functions (fan out/fan in) - node.js

I'm working on a small project to convert a large XML into several formatted PDF documents. The large XML contains multiple XMLs of similar format, so I'm using a single HTML template for printing all the documents. After producing all the PDF documents I also need to produce a metadata file with some basic info on each document that was printed.
I thought the fan out/fan in scenario of Durable Functions was a perfect fit for my use case. I'm working with Node.js. I set up all my code and it seems to be working fine locally. The orchestration function looks like this:
const df = require("durable-functions");

module.exports = df.orchestrator(function* (context) {
    // Fan out: fetch the large XML and get back one small XML per document
    const xmldata = yield context.df.callActivity("DurablegetblobJS1", "");

    const tasks = [];
    for (const file of xmldata) {
        tasks.push(context.df.callActivity("Durableactivityjs2", file));
    }

    // Fan in: wait for all the PDF activities to complete
    const outputs = yield context.df.Task.all(tasks);

    let finalout = "";
    for (const out of outputs) {
        console.log('I am done1 :' + out);
        finalout = finalout + out;
    }
    return finalout;
});
DurablegetblobJS1 : fetches the entire XML and splits it into multiple smaller XMLs (one per document).
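That activity's code isn't included in the question; a minimal sketch of what it might look like (the container and blob names are placeholders, while the HPDPSMsg/DocumentRequest structure is taken from the parsing code further down):
const { BlobServiceClient } = require("@azure/storage-blob");

async function streamToText(readable) {
    readable.setEncoding("utf8");
    let data = "";
    for await (const chunk of readable) {
        data += chunk;
    }
    return data;
}

module.exports = async function (context) {
    // Container and blob names here are placeholders, not from the question.
    const blobServiceClient = BlobServiceClient.fromConnectionString(
        process.env.STORAGE_CONNECTION_STRING || ""
    );
    const containerClient = blobServiceClient.getContainerClient("test");
    const blockBlobClient = containerClient.getBlockBlobClient("large.xml");

    const downloadResponse = await blockBlobClient.download(0);
    const xmltext = await streamToText(downloadResponse.readableStreamBody);

    // Naive split on the repeating element; each piece is re-wrapped in the
    // <HPDPSMsg> root that Durableactivityjs2's parser expects.
    return xmltext
        .split("</DocumentRequest>")
        .filter(part => part.includes("<DocumentRequest"))
        .map(part =>
            "<HPDPSMsg>" +
            part.slice(part.indexOf("<DocumentRequest")) +
            "</DocumentRequest></HPDPSMsg>"
        );
};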
Durableactivityjs2 : fetches the HTML template, extracts the different values from the individual XMLs, applies them to the HTML, and finally prints the PDF to Azure Storage. It returns the name of the PDF document that was printed, for creation of the metadata file. The code for this is below.
var xml2js = require('xml2js');
var html_to_pdf = require('html-pdf-node');
var parser = new xml2js.Parser();
const { BlobServiceClient } = require("@azure/storage-blob");

// Load the .env file if it exists
require("dotenv").config();

module.exports = async function (context) {
    var xmldict = {};
    var xmltext = context.bindings.name;

    // Extract the nodes and attributes
    metadata(xmltext, xmldict);
    report(xmltext, xmldict);
    context.log(xmldict);

    const AZURE_STORAGE_CONNECTION_STRING = process.env.STORAGE_CONNECTION_STRING || "";
    const blobServiceClient = BlobServiceClient.fromConnectionString(
        AZURE_STORAGE_CONNECTION_STRING
    );

    // Download the shared HTML template
    var containerClient = blobServiceClient.getContainerClient('test');
    var blockBlobClient = containerClient.getBlockBlobClient('comb_template.html');
    var downloadBlockBlobResponse = await blockBlobClient.download(0);
    var html_template = await streamToText(downloadBlockBlobResponse.readableStreamBody);

    let options = { format: 'A4' };
    let file = { content: html_template };
    const x = await writepdf1(file, options, blobServiceClient, xmldict);
    console.log("Written Blob PDF");
    return x;
};
async function writepdf1(file, options, blobServiceClient, xmldict) {
    // html-pdf-node drives Puppeteer under the hood to render the PDF
    const pdfBuffer = await html_to_pdf.generatePdf(file, options);
    const containerClient = blobServiceClient.getContainerClient('test2');
    const targetblob = xmldict['OU'] + '/' + xmldict['ReportName'] + '/' +
        xmldict['OU'] + '_' + xmldict['ReportName'] + '_' + xmldict['DocumentID'] + '_' + '.pdf';
    console.log('Blob name :' + targetblob);
    const blockBlobClient_t = containerClient.getBlockBlobClient(targetblob);
    const uploadBlobResponse = await blockBlobClient_t.upload(pdfBuffer, pdfBuffer.length);
    return targetblob;
}
async function streamToText(readable) {
    readable.setEncoding('utf8');
    let data = '';
    for await (const chunk of readable) {
        data += chunk;
    }
    return data;
}
function metadata(xmltext, xmldict) {
    parser.parseString(xmltext, function (err, result) {
        var test1 = result['HPDPSMsg']['DocumentRequest'][0]['MetaData'][0];
        Object.entries(test1).forEach(([key, value]) => {
            xmldict[key] = value[0];
        });
    });
}

function report(xmltext, xmldict) {
    parser.parseString(xmltext, function (err, result) {
        var test2 = result['HPDPSMsg']['DocumentRequest'][0]['Report'][0]['$'];
        Object.entries(test2).forEach(([key, value]) => {
            xmldict[key] = value;
        });
    });
}
However, when I deploy the entire project to an Azure Functions Premium plan (EP1, Windows), I see errors in App Insights when I try to execute my function, and the PDFs are never generated:
Activity function 'Durableactivityjs2' failed: Could not find browser
revision 818858. Run "PUPPETEER_PRODUCT=firefox npm install" or
"PUPPETEER_PRODUCT=firefox yarn install" to download a supported
Firefox browser binary
I'm a bit clueless as to how I'm supposed to resolve this. Any help or suggestions would be appreciated.
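For anyone hitting the same error: html-pdf-node uses Puppeteer (headless Chromium) under the hood, and this message typically means the browser binary that Puppeteer downloads at npm install time is missing from the deployed package. A minimal diagnostic sketch (assuming the default Puppeteer install that html-pdf-node depends on) logs where Puppeteer expects the binary at runtime and whether it actually exists on the function host:
const fs = require('fs');
const puppeteer = require('puppeteer');

module.exports = async function (context) {
    // Where Puppeteer expects its bundled Chromium at runtime
    const expectedPath = puppeteer.executablePath();
    context.log('Expected browser path:', expectedPath);
    // false here means the binary was not deployed with node_modules
    context.log('Binary present:', fs.existsSync(expectedPath));
};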

Related

Reading multiple files from google bucket & loading into BQ using nodejs

When I try to read files from a Google bucket and load the data into a BigQuery table, the bucket throws a timeout error. Is there a way to read the files synchronously and load them into the BigQuery table?
This works when there are few files. I tried using .then() as well, which gives the same error.
const { BigQuery } = require('@google-cloud/bigquery');
const { Storage } = require('@google-cloud/storage');
var fs = require("fs");

const bucketName = 'bucketname';
const gcpProject = "projectname";
const datasetprojectname = "bqprojectname";
const datasetId = "dsname";
const tableId = "tablename";

exports.helloworld = async (req, res) => {
    const bigquery = new BigQuery({ projectId: datasetprojectname });
    const storage = new Storage(gcpProject);
    const loaddatabq = new Storage(gcpProject);
    const bucket = storage.bucket(bucketName);
    const fileoptions = {
        prefix: "singlefile"
    };
    var filecount = 0;
    var errcount = 0;
    var filemoveerrcount = 0;
    const [getfilename] = await bucket.getFiles(fileoptions);
    var filenamespring = "";
    var getjson = null;
    getfilename.forEach(async files => {
        try {
            filecount++;
            var filename = files.name;
            if (filename != "singlefile/") {
                var contents = await files.download(files.name);
                await bigquery.dataset(datasetId).table(tableId).insert(JSON.parse(contents).body);
            }
        }
        catch (err) {
        }
    });
};
If your files are in JSONL (one JSON document per line, JSON Lines), you can use a load job to achieve this.
You can select the files that you want by using a wildcard character. It will be more efficient than a for loop.
This solution is also cheaper: you are limited to 1,500 loads per table per day, but loads are free. Your current code uses the streaming API, which you pay for ($0.05 per GB).
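For illustration, a minimal sketch of such a load job (reusing the bucket, prefix, and table names from the question, and assuming the files are JSONL):
const { BigQuery } = require('@google-cloud/bigquery');
const { Storage } = require('@google-cloud/storage');

const bigquery = new BigQuery({ projectId: 'bqprojectname' });
const storage = new Storage({ projectId: 'projectname' });

async function loadFromBucket() {
    // One load job covers every matching file; the wildcard replaces the loop.
    const [job] = await bigquery
        .dataset('dsname')
        .table('tablename')
        .load(storage.bucket('bucketname').file('singlefile/*'), {
            sourceFormat: 'NEWLINE_DELIMITED_JSON',
            writeDisposition: 'WRITE_APPEND',
        });
    console.log(`Load job ${job.id} completed.`);
}

loadFromBucket().catch(console.error);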

nodeJS async function parse csv return data to other file

I'm creating a small tool for internal users with Puppeteer.
Basically I have a CSV file with some data I "read" and fill forms with.
As I try to clean up my project to be reusable, I'm struggling a little bit.
I created a file named parsecsv.js:
const config = require('../config.json');
const parse = require('csv-parse');
const fs = require('fs');

const processFile = async () => {
    const records = [];
    const parser = fs
        .createReadStream(config.sourceFile)
        .pipe(parse({
            // CSV options
            from_line: 1,
            delimiter: ";",
        }));
    let i = 1;
    for await (const record of parser) {
        records.push(record);
        i++;
    }
    return records;
};

const processFileData = async () => {
    const records = await processFile();
    console.info(records);
    return records;
};

module.exports = {
    processFile, processFileData
};
In another JS file I have:
const parseCSV = require('./src/ParseCsv');
const records = parseCSV.processFileData();
const data = parseCSV.processFile();
console.log(typeof records);
console.table(records);
console.log(typeof data);
console.table(data);
But I never get my data, only an empty object.
How can I get my data so I can "share" it with other functions?
Thanks.
As your functions are async and return promises, you can do something like:
const parseCSV = require('./src/ParseCsv');

(async () => {
    const records = await parseCSV.processFileData();
    const data = await parseCSV.processFile();
    console.log(typeof records);
    console.table(records);
    console.log(typeof data);
    console.table(data);
})();
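If the goal is to "share" the parsed data with other functions without re-reading the file, one pattern (a sketch, not from the original answer) is to export the promise itself, so every consumer awaits the same single parse:
// parsecsv.js: export a single shared promise alongside the functions
const recordsPromise = processFile();
module.exports = { processFile, processFileData, recordsPromise };

// consumer.js
const { recordsPromise } = require('./src/ParseCsv');
(async () => {
    console.table(await recordsPromise);
})();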

Save CSV from the web to Apify Dataset

I am trying to get some CSV data from a Google Sheet and store it in an Apify dataset.
const Apify = require('apify');
const request = require('request-promise');

Apify.main(async () => {
    var URL = "https://docs.google.com/spreadsheets/d/1-auXklWqHQ-jj6AXymMPa7FLtP1eYGJGF3rprxuWitk/gviz/tq?tqx=out:csv";
    const html = await request(URL);
    console.log('My output:');
    console.log(html);
    await Apify.setValue('OUTPUT', html);
    const namedDataset = await Apify.openDataset();
    await namedDataset.pushData(html);
});
Here is the error message:
2020-01-01T16:43:21.501Z My output:
2020-01-01T16:43:21.510Z "city","country"
2020-01-01T16:43:21.512Z "Berlin ","Germany"
2020-01-01T16:43:21.513Z "Los Angeles","United States"
2020-01-01T16:43:21.514Z "Melbourne","Australia"
2020-01-01T16:43:21.516Z "Sydney","Australia"
2020-01-01T16:43:21.517Z "London","United Kingdom"
2020-01-01T16:43:21.519Z "New York City","United States"
2020-01-01T16:43:21.614Z ERROR: The function passed to Apify.main() threw an exception: (error details: type=invalid-parameter)
2020-01-01T16:43:21.616Z ApifyClientError: Parameter "data" of type Array | Object must be provided
2020-01-01T16:43:21.617Z at exports.checkParamOrThrow (/usr/src/app/node_modules/apify-client/build/utils.js:222:15)
2020-01-01T16:43:21.619Z at Dataset.pushData (/usr/src/app/node_modules/apify/build/dataset.js:222:34)
2020-01-01T16:43:21.620Z at Apify.main (/usr/src/app/main.js:16:22)
2020-01-01T16:43:21.621Z at process._tickCallback (internal/process/next_tick.js:68:7)
A more elegant solution would be using our Google Sheets actor.
const Apify = require('apify');

Apify.main(async () => {
    const spreadsheetId = '1-auXklWqHQ-jj6AXymMPa7FLtP1eYGJGF3rprxuWitk';
    const sheetsActorInput = {
        mode: 'read',
        spreadsheetId,
    };
    const data = await Apify.call('lukaskrivka/google-sheets', sheetsActorInput);
    const namedDataset = await Apify.openDataset('my-dataset');
    await namedDataset.pushData(data);
});
The only disadvantage (also an advantage in some sense) is that you need to authorize on your first run, but that is really simple.
I was able to use this somewhat hacky approach. I am sure there is a more modern, elegant approach:
const Apify = require('apify');
const request = require('request-promise');

function csvJSON(csv) { //https://stackoverflow.com/a/27979069/2330272
    var lines = csv.split("\n");
    var result = [];
    // NOTE: If your columns contain commas in their values, you'll need
    // to deal with those before doing the next step
    // (you might convert them to &&& or something, then convert them back later)
    // jsfiddle showing the issue https://jsfiddle.net/
    var headers = lines[0].split(",");
    for (var i = 1; i < lines.length; i++) {
        var obj = {};
        var currentline = lines[i].split(",");
        for (var j = 0; j < headers.length; j++) {
            obj[headers[j]] = currentline[j];
        }
        result.push(obj);
    }
    return JSON.stringify(result); //JSON
}

Apify.main(async () => {
    var URL = "https://docs.google.com/spreadsheets/d/1-auXklWqHQ-jj6AXymMPa7FLtP1eYGJGF3rprxuWitk/gviz/tq?tqx=out:csv"; //test
    const html = await request(URL);
    const urls = csvJSON(html.replace(/\"/g, "")); // remove quotes from csv data
    console.log('My output:');
    const namedDataset = await Apify.openDataset();
    await namedDataset.pushData(JSON.parse(urls));
});
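As the comment in csvJSON notes, the naive split breaks on quoted values that contain commas. A more robust sketch (assuming the csv-parse package used in the parsecsv.js question above, with its v4-style require path):
const parse = require('csv-parse/lib/sync');

// Parses the raw CSV string into an array of objects keyed by the header row,
// handling quoted fields and embedded commas correctly.
function csvToRecords(csv) {
    return parse(csv, {
        columns: true,
        skip_empty_lines: true,
    });
}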

Google Vision | Vietnamese: Low Quality OCR Results

Background
Using the Google Vision API (with Node) to recognize Vietnamese text, the results lack quality. Some (not all, but some) tone markers, as well as vowel signifiers, are missing.
Compared to their online demo, which returns a decent result (scroll down for live demo):
https://cloud.google.com/vision/
(As I do not have a company account with them, I cannot ask Google directly.)
Question
Can I tweak my request to get better results?
I already set the language hint to "vi" and tried to combine it with "en". I also tried the more specific "vi-VN".
Example Image
https://www.tecc.org/Slatwall/custom/assets/images/product/default/cache/j056vt-_800w_800h_sb.jpg
Example Code
const fs = require("fs");
const path = require("path");
const vision = require("@google-cloud/vision");

async function quickstart() {
    let text = "";
    const fileName = "j056vt-_800w_800h_sb.jpg";
    const imageFile = fs.readFileSync(fileName);
    const image = Buffer.from(imageFile).toString("base64");
    const client = new vision.ImageAnnotatorClient();
    const request = {
        image: {
            content: image
        },
        imageContext: {
            languageHints: ["vi", "en"]
        }
    };
    const [result] = await client.textDetection(request);
    for (const tmp of result.textAnnotations) {
        text += tmp.description + '\n';
    }
    const out = path.basename(fileName, path.extname(fileName)) + ".txt";
    fs.writeFileSync(out, text);
}

quickstart();
Solution
// $env:GOOGLE_APPLICATION_CREDENTIALS="[PATH]"
const fs = require("fs");
const path = require("path");
const vision = require("@google-cloud/vision");

async function quickstart() {
    let text = '';
    const fileName = "j056vt-_800w_800h_sb.jpg";
    const imageFile = fs.readFileSync(fileName);
    const image = Buffer.from(imageFile).toString("base64");
    const client = new vision.ImageAnnotatorClient();
    const request = {
        image: {
            content: image
        },
        imageContext: {
            languageHints: ["vi-VN"]
        }
    };
    const [result] = await client.documentTextDetection(request);

    // OUTPUT METHOD A
    for (const tmp of result.textAnnotations) {
        text += tmp.description + "\n";
    }
    console.log(text);
    const out = path.basename(fileName, path.extname(fileName)) + ".txt";
    fs.writeFileSync(out, text);

    // OUTPUT METHOD B
    const fullTextAnnotation = result.fullTextAnnotation;
    console.log(`Full text: ${fullTextAnnotation.text}`);
    fullTextAnnotation.pages.forEach(page => {
        page.blocks.forEach(block => {
            console.log(`Block confidence: ${block.confidence}`);
            block.paragraphs.forEach(paragraph => {
                console.log(`Paragraph confidence: ${paragraph.confidence}`);
                paragraph.words.forEach(word => {
                    const wordText = word.symbols.map(s => s.text).join("");
                    console.log(`Word text: ${wordText}`);
                    console.log(`Word confidence: ${word.confidence}`);
                    word.symbols.forEach(symbol => {
                        console.log(`Symbol text: ${symbol.text}`);
                        console.log(`Symbol confidence: ${symbol.confidence}`);
                    });
                });
            });
        });
    });
}

quickstart();
This question is already answered in this one.
In summary, the demo is in this case probably using DOCUMENT_TEXT_DETECTION, which can sometimes produce a more thorough string extraction, while you are using TEXT_DETECTION.
You can try making a client.documentTextDetection() request instead of client.textDetection(), and you will probably get results closer to the demo.
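In the Node client the two feature types correspond to these methods (a minimal sketch; request is the same object built in the question):
// inside an async function such as quickstart() above

// TEXT_DETECTION: tuned for sparse text in photos
const [sparse] = await client.textDetection(request);

// DOCUMENT_TEXT_DETECTION: tuned for dense text, and in this thread's
// experience handles diacritics such as Vietnamese tone marks better
const [dense] = await client.documentTextDetection(request);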
If you want to read the related documentation, you can find it here.
I hope this resolves your question!

Firebase Database Get All Value In Order Cloud Functions

I am developing with Firebase Cloud Functions. I have a Firebase Realtime Database like this:
----- myData
------- eqewrwrepere (this one is a device token)
--------- Lta+sde-fer (this one is a firebase id)
            firstvalue : "a"
            secondvalue : "b"
--------- Qrgd+ad-qdda (this one is a second firebase id)
            firstvalue : "c"
            secondvalue : "d"
------- eqwerSAsdqe (this one is another device token)
--------- Lta+sde-fer (this one is a firebase id)
            firstvalue : "x"
            secondvalue : "y"
--------- Qrgd+ad-qdda (this one is a second firebase id)
            firstvalue : "z"
            secondvalue : "t"
I fetch this data with the code below: I fetch all the data, put it in an array, and when fetching is done, I loop over this array to find items. I am an iOS developer, so I am a newbie at Node.js. Here is what I want to do for each database entry:
1. Get firstvalue for each database entry.
2. Make an API request with that firstvalue.
3. The API returns an image.
4. Write the image to the temp directory.
5. Process this image with the Vision API.
6. Extract the text.
7. Update the database.
8. Send a notification to the deviceToken.
Right now I am able to retrieve the database items into my array. But when I make a request inside the for loop, the request is called asynchronously, so the loop continues while the request response, the file writing, and the Vision processing are executed only once.
What I want: in the for loop, take databaseArray[0], make the request, write the file, process it with the Vision API, update the database, and only then go on to the next databaseArray[1] item.
I read about Promises on different pages, but I did not understand them.
Thank you.
'use strict';
const functions = require('firebase-functions');
const admin = require('firebase-admin');
admin.initializeApp(functions.config().firebase);
var request = require('request');
var fs = require('fs');
var os = require("os");

// Get a reference to the Cloud Vision API component
const Vision = require('@google-cloud/vision');
const vision = new Vision.ImageAnnotatorClient();

var databaseArray = [];

exports.hourly_job = functions.pubsub
    .topic('hourly-job')
    .onPublish((event) => {
        console.log("Hourly Job");
        var db = admin.database();
        var ref = db.ref("myData");
        ref.once("value").then(function(allData) {
            allData.forEach(function(deviceToken) {
                deviceToken.forEach(function(firebaseIDs) {
                    var deviceTokenVar = deviceToken.key;
                    var firebaseIDVar = firebaseIDs.key;
                    var firstvalue = firebaseIDs.child("firstvalue").val();
                    var secondvalue = firebaseIDs.child("secondvalue").val();
                    var items = [deviceTokenVar, firebaseIDVar, firstvalue, secondvalue];
                    databaseArray.push([...items]);
                });
            });
            return databaseArray;
        }).then(function(databasem) {
            var i;
            for (i = 0; i < databaseArray.length; i++) {
                var databaseArrayDeviceToken = databaseArray[i][0];
                console.log("DeviceToken: " + databaseArrayDeviceToken);
                var databaseArrayFirebaseID = databaseArray[i][1];
                console.log("FirebaseID: " + databaseArrayFirebaseID);
                var databaseArrayfirstvalue = databaseArray[i][2];
                console.log("firstval: " + databaseArrayfirstvalue);
                var databaseArraysecondval = databaseArray[i][3];
                console.log("Second: " + databaseArraysecondval);
                var url = "http://api.blabla" + databaseArrayfirstvalue;
                ///////////// here make a request, pause the loop, process the returned image, but how /////////////
                request.get({
                    url: url,
                    encoding: 'binary'
                }, function(error, httpResponse, body) {
                    if (!error && httpResponse.statusCode == 200) {
                        fs.writeFileSync('/tmp/processed.jpg', body, 'binary');
                        console.log("file written");
                    }
                });
            }
        });
        return true;
    });
I found the solution with Mocas's help; it is below. I use async/await functions in the code, so now the for loop waits for each function response. But now I have a different problem: I think the main async function hangs because of the awaits, and then on the next hourly trigger it runs again. So the console log shows 15, 16, 17, or more 'i' values in the for loop, even though I have only 4 elements in the database array, and the count increases every hour. So I guess that I should cancel the awaited calls after a timeout, but I don't know how (see the timeout sketch after the code below). Here is the code:
'use strict';
const functions = require('firebase-functions');
const admin = require('firebase-admin');
admin.initializeApp(functions.config().firebase);
var request = require('request-promise').defaults({ encoding: null });
var fs = require('fs');
var os = require("os");

// Get a reference to the Cloud Vision API component
const Vision = require('@google-cloud/vision');
const vision = new Vision.ImageAnnotatorClient();

var databaseArray = [];
var uniqueFilename = require('unique-filename');

exports.hourly_job = functions.pubsub
    .topic('hourly-job')
    .onPublish((event) => {
        console.log("Hourly Job");
        var db = admin.database();
        var ref = db.ref("myData");
        ref.once("value").then(function(allData) {
            allData.forEach(function(deviceToken) {
                deviceToken.forEach(function(firebaseIDs) {
                    var deviceTokenVar = deviceToken.key;
                    var firebaseIDVar = firebaseIDs.key;
                    var firstvalue = firebaseIDs.child("firstvalue").val();
                    var secondvalue = firebaseIDs.child("secondvalue").val();
                    var items = [deviceTokenVar, firebaseIDVar, firstvalue, secondvalue];
                    databaseArray.push([...items]);
                });
            });
            return databaseArray;
        }).then(function (databasem) {
            main().catch(console.error);
        });
        return true;
    });

const main = async () => {
    var i;
    for (i = 0; i < databaseArray.length; i++) {
        console.log("Database Arrays " + i + ". elements: ");
        var databaseArrayDeviceToken = databaseArray[i][0];
        console.log("DeviceToken: " + databaseArrayDeviceToken);
        var databaseArrayFirebaseID = databaseArray[i][1];
        console.log("FirebaseID: " + databaseArrayFirebaseID);
        var databaseArrayfirst = databaseArray[i][2];
        console.log("first: " + databaseArrayfirst);
        var databaseArraysecond = databaseArray[i][3];
        console.log("second: " + databaseArraysecond);
        if (databaseArrayfirst != "") {
            var apiUrl = "http://api.blabla" + databaseArrayfirst;
            try {
                const apiBody = await request.get(apiUrl);
                /////////////////////////// vision start //////////////////////
                const visionResponseBody = await vision.documentTextDetection(apiBody);
                var visionResponse = visionResponseBody[0].textAnnotations[0].description;
                console.log("Vision response text " + visionResponse);
                // ...some logic here about the response...
                /////////////////////////////////////////////////
                var getdatabasevar = admin.database().ref("myData/" + databaseArrayDeviceToken + "/" + databaseArrayFirebaseID);
                await getdatabasevar.update({
                    "firstvalue": visionResponse
                });
                /////////////////////////////////////////////////
                var getanotgerdatabasevar = admin.database().ref("myData/" + databaseArrayDeviceToken + "/" + databaseArrayFirebaseID + "/" + "secondvalue");
                await getanotgerdatabasevar.once("value")
                    .then(function(snapshot) {
                        // ..some logic..
                        // send notification
                    });
            } catch (error) {
                console.error(error);
            }
            /////////////////////////// vision end //////////////////////
        }
    }
    return true;
};
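For the timeout the last paragraph asks about, one option (a sketch, not from the thread) is to race each awaited call against a timer with Promise.race, so a hung request rejects instead of blocking the loop forever:
// Rejects if the wrapped promise does not settle within ms milliseconds.
const withTimeout = (promise, ms) =>
    Promise.race([
        promise,
        new Promise((_, reject) =>
            setTimeout(() => reject(new Error("Timed out after " + ms + " ms")), ms)
        ),
    ]);

// Usage inside the loop above; the existing try/catch will log the rejection:
// const apiBody = await withTimeout(request.get(apiUrl), 30000);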
