Save CSV from the web to Apify Dataset - node.js

I am trying to get some CSV data from a Google Sheet and store it in an Apify dataset.
const Apify = require('apify');
const request = require('request-promise');

Apify.main(async () => {
    const URL = "https://docs.google.com/spreadsheets/d/1-auXklWqHQ-jj6AXymMPa7FLtP1eYGJGF3rprxuWitk/gviz/tq?tqx=out:csv";
    const html = await request(URL);
    console.log('My output:');
    console.log(html);
    await Apify.setValue('OUTPUT', html);
    const namedDataset = await Apify.openDataset();
    await namedDataset.pushData(html);
});
Here is the error message:
2020-01-01T16:43:21.501Z My output:
2020-01-01T16:43:21.510Z "city","country"
2020-01-01T16:43:21.512Z "Berlin ","Germany"
2020-01-01T16:43:21.513Z "Los Angeles","United States"
2020-01-01T16:43:21.514Z "Melbourne","Australia"
2020-01-01T16:43:21.516Z "Sydney","Australia"
2020-01-01T16:43:21.517Z "London","United Kingdom"
2020-01-01T16:43:21.519Z "New York City","United States"
2020-01-01T16:43:21.614Z ERROR: The function passed to Apify.main() threw an exception: (error details: type=invalid-parameter)
2020-01-01T16:43:21.616Z ApifyClientError: Parameter "data" of type Array | Object must be provided
2020-01-01T16:43:21.617Z at exports.checkParamOrThrow (/usr/src/app/node_modules/apify-client/build/utils.js:222:15)
2020-01-01T16:43:21.619Z at Dataset.pushData (/usr/src/app/node_modules/apify/build/dataset.js:222:34)
2020-01-01T16:43:21.620Z at Apify.main (/usr/src/app/main.js:16:22)
2020-01-01T16:43:21.621Z at process._tickCallback (internal/process/next_tick.js:68:7)

A more elegant solution would be to use our Google Sheets actor.
const Apify = require('apify');

Apify.main(async () => {
    const spreadsheetId = '1-auXklWqHQ-jj6AXymMPa7FLtP1eYGJGF3rprxuWitk';
    const sheetsActorInput = {
        mode: 'read',
        spreadsheetId,
    };
    const data = await Apify.call('lukaskrivka/google-sheets', sheetsActorInput);
    const namedDataset = await Apify.openDataset('my-dataset');
    await namedDataset.pushData(data);
});
The only disadvantage (also an advantage in some sense) is that you need to authorize on your first run, but that is really simple.
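As a usage note, you can read the stored rows back out to verify the run. A minimal sketch, assuming dataset.getData() from the same Apify SDK:

const Apify = require('apify');

Apify.main(async () => {
    // Hypothetical check: read the stored rows back out of the named dataset.
    const namedDataset = await Apify.openDataset('my-dataset');
    const { items } = await namedDataset.getData();
    console.log(`Dataset holds ${items.length} rows`, items.slice(0, 3));
});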

I was able to use this somewhat hacky approach. I am sure there is a more modern, elegant approach:
const Apify = require('apify');
const request = require('request-promise');

function csvJSON(csv) { // https://stackoverflow.com/a/27979069/2330272
    var lines = csv.split("\n");
    var result = [];
    // NOTE: If your columns contain commas in their values, you'll need
    // to deal with those before doing the next step
    // (you might convert them to &&& or something, then convert them back later)
    // jsfiddle showing the issue https://jsfiddle.net/
    var headers = lines[0].split(",");
    for (var i = 1; i < lines.length; i++) {
        var obj = {};
        var currentline = lines[i].split(",");
        for (var j = 0; j < headers.length; j++) {
            obj[headers[j]] = currentline[j];
        }
        result.push(obj);
    }
    return JSON.stringify(result); // JSON
}
Apify.main(async () => {
    var URL = "https://docs.google.com/spreadsheets/d/1-auXklWqHQ-jj6AXymMPa7FLtP1eYGJGF3rprxuWitk/gviz/tq?tqx=out:csv"; // test
    const html = await request(URL);
    const urls = csvJSON(html.replace(/\"/g, "")); // remove quotes from csv data
    console.log('My output:');
    const namedDataset = await Apify.openDataset();
    await namedDataset.pushData(JSON.parse(urls));
});
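For a more robust version of the same idea, here is a sketch assuming the csv-parse package, which handles quoted fields and embedded commas that the split-based helper above cannot:

const Apify = require('apify');
const request = require('request-promise');
const { parse } = require('csv-parse/sync'); // npm install csv-parse

Apify.main(async () => {
    const URL = "https://docs.google.com/spreadsheets/d/1-auXklWqHQ-jj6AXymMPa7FLtP1eYGJGF3rprxuWitk/gviz/tq?tqx=out:csv";
    const csv = await request(URL);
    // columns: true turns each row into an object keyed by the header row,
    // which is exactly the Array-of-Objects shape pushData() expects
    const records = parse(csv, { columns: true, skip_empty_lines: true });
    const namedDataset = await Apify.openDataset();
    await namedDataset.pushData(records);
});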

Related

Puppeteer to convert html to pdf using Nodejs in Durable functions(fan out fan in)

I'm working on a small project to convert a large XML into several formatted PDF documents. The large XML contains multiple similar-format XMLs, so I'm using a single HTML template for printing all the documents. After producing all the PDF documents I also need to produce a metadata file with some basic info on each document that was printed.
I thought the fan out/fan in scenario of durable functions was a perfect fit for my use case. I'm working with Node.js. I set up all my code and it seems to be working fine locally. The orchestration function looks like the below.
const df = require("durable-functions");

module.exports = df.orchestrator(function* (context) {
    var xmldata = yield context.df.callActivity("DurablegetblobJS1", "");
    var tasks = [];
    for (const file of xmldata) {
        tasks.push(context.df.callActivity("Durableactivityjs2", file));
    }
    const outputs = yield context.df.Task.all(tasks);
    var finalout = "";
    for (const out of outputs) {
        console.log('I am done1 :' + out);
        finalout = finalout + out;
    }
    return finalout;
});
DurablegetblobJS1: fetches the entire XML and splits it into multiple smaller XMLs (one per document).
Durableactivityjs2: fetches the HTML template, extracts the different values from the individual XMLs, applies them to the HTML, and finally prints the PDF into Azure storage. It returns the name of the PDF document that was printed, for creation of the metadata file. The code for this is below.
var fs = require('fs');
var xml2js = require('xml2js');
var html_to_pdf = require('html-pdf-node');
var parser = new xml2js.Parser();

module.exports = async function (context) {
    //console.log("Hello from activity :")
    var xmldict = {};
    var xmltext = context.bindings.name;
    // Extract the nodes and attributes
    metadata(xmltext, xmldict);
    report(xmltext, xmldict);
    context.log(xmldict);
    const { BlobServiceClient } = require("@azure/storage-blob");
    // Load the .env file if it exists
    require("dotenv").config();
    const AZURE_STORAGE_CONNECTION_STRING = process.env.STORAGE_CONNECTION_STRING || "";
    const blobServiceClient = BlobServiceClient.fromConnectionString(
        AZURE_STORAGE_CONNECTION_STRING
    );
    var containerClient = blobServiceClient.getContainerClient('test');
    var blobname = 'comb_template.html';
    var blockBlobClient = containerClient.getBlockBlobClient(blobname);
    var downloadBlockBlobResponse = await blockBlobClient.download(0);
    var html_template = await streamToText(downloadBlockBlobResponse.readableStreamBody);
    let options = { format: 'A4' };
    let file = { content: html_template };
    const x = await writepdf1(file, options, blobServiceClient, xmldict);
    console.log("Written Blob PDF");
    return x;
};
async function writepdf1(file, options, blobServiceClient, xmldict) {
    const pdfBuffer = await html_to_pdf.generatePdf(file, options);
    const containerClient = blobServiceClient.getContainerClient('test2');
    const targetblob = xmldict['OU'] + '/' + xmldict['ReportName'] + '/' + xmldict['OU'] + '_' + xmldict['ReportName'] + '_' + xmldict['DocumentID'] + '_' + '.pdf';
    console.log('Blob name :' + targetblob);
    const blockBlobClient_t = containerClient.getBlockBlobClient(targetblob);
    const uploadBlobResponse = await blockBlobClient_t.upload(pdfBuffer, pdfBuffer.length);
    return targetblob;
}
async function streamToText(readable) {
    readable.setEncoding('utf8');
    let data = '';
    for await (const chunk of readable) {
        data += chunk;
    }
    return data;
}
function metadata(xmltext, xmldict) {
    parser.parseString(xmltext, function (err, result) {
        var test1 = result['HPDPSMsg']['DocumentRequest'][0]['MetaData'][0];
        Object.entries(test1).forEach(([key, value]) => {
            xmldict[key] = value[0];
        });
    });
}

function report(xmltext, xmldict) {
    parser.parseString(xmltext, function (err, result) {
        var test2 = result['HPDPSMsg']['DocumentRequest'][0]['Report'][0]['$'];
        Object.entries(test2).forEach(([key, value]) => {
            xmldict[key] = value;
        });
    });
}
However, when I deploy the entire project into an Azure premium function (EP1, Windows), I see some errors in App Insights when I try to execute my function, and the PDFs are never generated.
Activity function 'Durableactivityjs2' failed: Could not find browser
revision 818858. Run "PUPPETEER_PRODUCT=firefox npm install" or
"PUPPETEER_PRODUCT=firefox yarn install" to download a supported
Firefox browser binary
I'm a bit clueless about how I'm supposed to resolve this. Any help or suggestions would be appreciated.
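One likely cause (an assumption based on the error text, not something the logs prove) is that the Chromium build puppeteer downloads at npm install time never makes it into the deployment package, so the activity cannot find a browser at runtime. Building node_modules on the Function host (for example with SCM_DO_BUILD_DURING_DEPLOYMENT=true) is one route; another is to point puppeteer at a browser binary you know exists. A minimal sketch of the latter, replacing the html_to_pdf.generatePdf call with puppeteer directly (html-pdf-node uses puppeteer under the hood):

const puppeteer = require('puppeteer');

// Hypothetical replacement for html_to_pdf.generatePdf(file, options):
// render the HTML in a browser we point at explicitly, return the PDF buffer.
async function htmlToPdfBuffer(html, options) {
    const browser = await puppeteer.launch({
        // PUPPETEER_EXECUTABLE_PATH is an assumed app setting naming a
        // browser binary that actually exists on the Function host.
        executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || undefined,
        args: ['--no-sandbox'],
    });
    try {
        const page = await browser.newPage();
        await page.setContent(html, { waitUntil: 'networkidle0' });
        return await page.pdf(options); // e.g. { format: 'A4' }
    } finally {
        await browser.close();
    }
}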

Facing issue with scoping in Node.js

This code logs an empty object ({}):
// declared at top
let mainData = {};
let trainStations = {};
let routes = {};
let trainNo = {};

data["data"].forEach(async (element) => {
    const response2 = await fetch(
        `https://india-rail.herokuapp.com/trains/getRoute?trainNo=${element["train_base"]["train_no"]}`
    );
    const data2 = await response2.json();
    data2["data"].forEach((ele) => {
        routes[ele["source_stn_code"]] = true;
    });
    trainNo[element["train_base"]["train_no"]] = routes;
});
console.log(trainNo);
If I do this, then it logs a response with data:
data["data"].forEach(async (element) => {
const response2 = await fetch(
`https://india-rail.herokuapp.com/trains/getRoute?trainNo=${element["train_base"]["train_no"]}`
);
const data2 = await response2.json();
data2["data"].forEach((ele) => {
routes[ele["source_stn_code"]] = true;
});
trainNo[element["train_base"]["train_no"]] = routes;
console.log(trainNo);
});
Maybe there is some scoping issue; please kindly help me to solve this problem :)
As a short note, using await inside a forEach() loop will give unexpected results. This is because forEach() does not wait for the promise to settle (either fulfilled or rejected).
A simple solution for this could be using either the traditional for loop or the for..of loop.
for (let element of data["data"]) {
    const response2 = await fetch(
        `https://india-rail.herokuapp.com/trains/getRoute?trainNo=${element["train_base"]["train_no"]}`
    );
    const data2 = await response2.json();
    data2["data"].forEach((ele) => {
        routes[ele["source_stn_code"]] = true;
    });
    // note: routes is the single object declared at the top, so every train
    // ends up sharing it; see the sketch after the note below for a
    // per-train object
    trainNo[element["train_base"]["train_no"]] = routes;
}
console.log(trainNo);
NOTE: Make sure to wrap the above for..of loop inside an async function, because the await keyword is only allowed inside functions defined with the async keyword.
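A minimal sketch of such a wrapper, reusing the names from the question and giving each train a fresh object so they do not all share routes:

(async () => {
    for (const element of data["data"]) {
        const response2 = await fetch(
            `https://india-rail.herokuapp.com/trains/getRoute?trainNo=${element["train_base"]["train_no"]}`
        );
        const data2 = await response2.json();
        const stops = {}; // fresh object per train instead of the shared routes
        data2["data"].forEach((ele) => {
            stops[ele["source_stn_code"]] = true;
        });
        trainNo[element["train_base"]["train_no"]] = stops;
    }
    console.log(trainNo); // runs only after every request has settled
})();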

Node.js async function to parse CSV and return data to another file

I'm creating a small tool for internal users with puppeteer.
Basically I have a CSV file with some data I "read" and fill a form with.
As I try to clean up my project to be reusable, I'm struggling a little bit:
I created a file named parsecsv.js
const config = require('../config.json');
const parse = require('csv-parse');
const fs = require('fs');

const processFile = async () => {
    const records = [];
    const parser = fs
        .createReadStream(config.sourceFile)
        .pipe(parse({
            // CSV options
            from_line: 1,
            delimiter: ";",
        }));
    for await (const record of parser) {
        records.push(record);
    }
    return records;
};

const processFileData = async () => {
    const records = await processFile();
    console.info(records);
    return records;
};

module.exports = {
    processFile, processFileData
};
In another JS file I made
const parseCSV = require('./src/ParseCsv');
const records = parseCSV.processFileData();
const data = parseCSV.processFile();
console.log(typeof records);
console.table(records);
console.log(typeof data);
console.table(data);
But I never get my data, only an empty object.
How can I get my data so I'm able to "share" it with other functions?
Thanks
As your functions are async ones and they return promises, you can do something like:
const parseCSV = require('./src/ParseCsv');

(async () => {
    const records = await parseCSV.processFileData();
    const data = await parseCSV.processFile();
    console.log(typeof records);
    console.table(records);
    console.log(typeof data);
    console.table(data);
})();
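Note that processFileData already calls processFile internally, so awaiting both reads and parses the CSV twice. A sketch that parses once and passes the records on (fillForms is a hypothetical stand-in for the puppeteer step):

const parseCSV = require('./src/ParseCsv');

(async () => {
    // Parse once and reuse the resulting array everywhere.
    const records = await parseCSV.processFile();
    console.table(records);
    // await fillForms(records); // hypothetical puppeteer form-filling step
})();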

All my scraped text ends up in one big object instead of separate objects with Cheerio

I'm following a web scraping course that uses Cheerio. I practice on a different website than they use in the course, and now I run into the problem that all my scraped text ends up in one big object, while every title should end up in its own object. Can someone see what I did wrong? I've already bumped my head on this problem for two hours.
const request = require('request-promise');
const cheerio = require('cheerio');

const url = "https://huurgoed.nl/gehele-aanbod";
const scrapeResults = [];

async function scrapeHuurgoed() {
    try {
        const htmlResult = await request.get(url);
        const $ = await cheerio.load(htmlResult);
        $("div.aanbod").each((index, element) => {
            const result = $(element).children(".item");
            const title = result.find("h2").text().trim();
            const characteristics = result.find("h4").text();
            const scrapeResult = { title, characteristics };
            scrapeResults.push(scrapeResult);
        });
        console.log(scrapeResults);
    } catch (err) {
        console.error(err);
    }
}
scrapeHuurgoed();
This is the link to the repo: https://github.com/danielkroon/huurgoed-scraper/blob/master/index.js
Thanks!
That is because of the way you used your selectors. I've modified your script to fetch the content as you expected; it currently collects titles and characteristics. Feel free to add the rest within your script.
This is how you can get the required output:
const request = require('request-promise');
const cheerio = require('cheerio');

const url = "https://huurgoed.nl/gehele-aanbod";
const scrapeResults = [];

async function scrapeHuurgoed() {
    try {
        const htmlResult = await request.get(url);
        const $ = cheerio.load(htmlResult); // cheerio.load is synchronous
        // iterate over each listing so every item produces its own object
        $("div.item").each((index, element) => {
            const title = $(element).find(".kenmerken > h2").text().trim();
            const characteristics = $(element).find("h4").text().trim();
            scrapeResults.push({ title, characteristics });
        });
        console.log(scrapeResults);
    } catch (err) {
        console.error(err);
    }
}
scrapeHuurgoed();
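If the results need to outlive the process, a sketch assuming a plain JSON file is acceptable; call it right after console.log(scrapeResults):

const fs = require('fs');

// Hypothetical follow-up: persist the scraped items once scraping finishes,
// e.g. saveResults(scrapeResults).
function saveResults(results, path = 'results.json') {
    fs.writeFileSync(path, JSON.stringify(results, null, 2));
    console.log(`Saved ${results.length} items to ${path}`);
}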

How to make the function wait till I get the response in node.js

Async and await are not working as expected. Please correct me where I am going wrong in the code.
I am reading data (url, pageLimit, company) from Excel and, using a switch(), I am routing to the matching service.
I have to wait until I get the response from cnbservice.GetcnbOpenings(url, pageLimit, company), store the response in a global array, then call mdsservice.GetMdsOpenings(url, pageLimit, company) and append its results to the global array.
const readexcel = async (request, response) => {
    const workbook = XLSX.readFile('file.xlsx');
    const sheetnamelist = workbook.SheetNames;
    var xldata = XLSX.utils.sheet_to_json(workbook.Sheets[sheetnamelist[0]]);
    dataarray = [];
    for (i = 0; i < xldata.length; i++) {
        company = xldata[i].company;
        url = xldata[i].careers_link_url;
        pageLimit = xldata[i].pagelimit;
        switch (company) {
            case process.env.cnb_company_name:
                const arr = await cnbservice.GetcnbOpenings(url, pageLimit, company);
                if (arr !== undefined) {
                    dataarray.push(arr);
                }
                break;
            case process.env.mds_company_name:
                const arr1 = await mdsservice.GetMdsOpenings(url, pageLimit, company);
                if (arr1 !== undefined) {
                    dataarray.push(arr1);
                }
                break;
            case "default":
                console.log("Company Name not matching with any of the services");
        }
    }
}
You are awaiting calls inside a standard for loop built on undeclared variables, which is fragile; the idiomatic way to run async/await sequentially over an array is a for...of loop.
for (let element of array) {
    // await call
}
After making the following changes, your code will work as expected. Note also that case "default": only matches the literal string "default"; the fallback clause of a switch is written as default:, which the version below uses.
const readexcel = async (request, response) => {
    const workbook = XLSX.readFile('file.xlsx');
    const sheetnamelist = workbook.SheetNames;
    const xldata = XLSX.utils.sheet_to_json(workbook.Sheets[sheetnamelist[0]]);
    const dataarray = [];
    for (const element of xldata) {
        const company = element.company;
        const url = element.careers_link_url;
        const pageLimit = element.pagelimit;
        switch (company) {
            case process.env.cnb_company_name: {
                const arr = await cnbservice.GetcnbOpenings(url, pageLimit, company);
                if (arr !== undefined) {
                    dataarray.push(arr);
                }
                break;
            }
            case process.env.mds_company_name: {
                const arr1 = await mdsservice.GetMdsOpenings(url, pageLimit, company);
                if (arr1 !== undefined) {
                    dataarray.push(arr1);
                }
                break;
            }
            default:
                console.log("Company Name not matching with any of the services");
        }
    }
}
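One more detail worth flagging (an assumption, since the surrounding framework isn't shown): readexcel receives request and response but never replies, so whatever calls it has no way to see dataarray. If this is an Express-style handler, the collected results can be sent back once the loop finishes:

// Hypothetical ending for the handler above, assuming Express-style
// request/response objects.
const readexcel = async (request, response) => {
    // ... the loop from above fills dataarray ...
    response.json(dataarray); // hand the combined results back to the caller
};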
