screenshotting urls with node - async - node.js

Here is my code:
const fs = require('fs');
const screenshot = require('screenshot-stream');
const urlp = require('url');
var urls=[
'https://archive.org/details/8bitrecs',
'http://hackaday.com/',
'http://techcrunch.com/2012/02/16/auraslate-is-an-open-source-android-tablet-for-hackers/',
'http://www.english.illinois.edu/-people-/faculty/debaron/482/482readings/greenfield.html',
'http://sustain.rca.ac.uk/Sustain-Talks'];
urls.forEach(function(url){
const stream = screenshot(url, '1024x768', {crop: true});
stream.pipe(fs.createWriteStream(urlp.parse(url).hostname + 'test-1024x768.png'));
});
It only screenshots the last item in the urls array. The others are images with zero bytes. I think I need to do the operation asynchronously so it doesn't overwrite the stream each time.
How would I do this?
UPDATE:
I want the screenshot to work, but catch errors and not block if a url is not accessible
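One way to keep going when a URL fails (a sketch, reusing the requires and urls array from above) is to attach an 'error' handler to each screenshot stream, so a bad URL is logged instead of killing the process with an unhandled 'error' event:
urls.forEach(function (url) {
  const stream = screenshot(url, '1024x768', { crop: true });
  // log the failure and move on instead of letting the unhandled 'error' event crash the process
  stream.on('error', function (err) {
    console.error('could not screenshot', url, err.message);
  });
  stream.pipe(fs.createWriteStream(urlp.parse(url).hostname + 'test-1024x768.png'));
});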
UPDATE:
https://www.npmjs.com/package/screenshot-promise worked better, although the code below still slows my computer down a lot!
const screenshotPromise = require('screenshot-promise');
...
urls.forEach(function(url) {
const promise = screenshotPromise(url, '1024x768', {crop: true}).then(buf => {
fs.writeFileSync(urlp.parse(url).hostname + 'test-1024x768.png', buf);
});
promise.then((value) => {
// value is whatever the previous .then callback returned
// (here it is undefined, since fs.writeFileSync doesn't return anything).
console.log(value);
});
});
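Part of the slowdown is that forEach starts every screenshot at once. A sequential variant of the same screenshot-promise call (just a sketch, reusing fs, urlp and urls from above) takes one URL at a time and catches failures per URL:
(async () => {
  for (const url of urls) {
    try {
      // one page at a time keeps CPU and memory use bounded
      const buf = await screenshotPromise(url, '1024x768', { crop: true });
      fs.writeFileSync(urlp.parse(url).hostname + 'test-1024x768.png', buf);
      console.log('saved', url);
    } catch (err) {
      console.error('failed', url, err.message);
    }
  }
})();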

What you failed to add here is the error :
events.js:160
throw er; // Unhandled 'error' event
^
Error: Couldn't load url: http://techcrunch.com/2012/02/16/auraslate-is-an-open-source-android-tablet-for-hackers/
at LineStream.byline.on.data
(e:---\node_modules\screenshot-stream\index.js:77:16)
at emitOne (events.js:96:13)
at LineStream.emit (events.js:188:7)
The issue is that the screenshot-stream module uses PhantomJS, and PhantomJS is unable to load that page, which produces the error.
This error seems related to this issue : https://github.com/ariya/phantomjs/issues/10460.
Techcrunch.com and Aol.com seem to use web fonts (e.g. "BebasNeue-webfont.ttf") which Qt loads as application fonts. Something may be going wrong there.
My suggestion is to use Google's Puppeteer, which includes a built-in screenshot method: https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#pagescreenshotoptions

The code I ended up with, which worked:
const puppeteer = require('puppeteer');
const urlp = require('url');
var URL = require('url-parse');
var urls = [
'https://archive.org/details/8bitrecs',
'http://hackaday.com/',
'http://techcrunch.com/2012/02/16/auraslate-is-an-open-source-android-tablet-for-hackers/',
'http://www.english.illinois.edu/-people-/faculty/debaron/482/482readings/greenfield.html',
'http://sustain.rca.ac.uk/Sustain-Talks',
'https://www.quintessentially.com/',
'https://www.producthunt.com/tech/ux-project-checklist',
'https://freedom.press/',
'http://issuu.com/search?q=vintage+motorcycle',
'http://www.pocketmod.com/v2/',
'https://www.metamind.io/',
'http://nautil.us/blog/chernobyls-hot-mess-the-elephants-foot-is-still-lethal',
'https://www.instructables.com/id/Tool-Storage-Hacks-or-How-to-Hang-Those-Black-Frid/',
'https://www.zippi.co.uk/framed-photo-print'
];
var getLocation = function(href) {
var l = document.createElement("a");
l.href = href;
return l;
};
(async() => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
// give slow pages a longer navigation timeout
page.setDefaultNavigationTimeout(40000);
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
var url1 = new URL(url);
try {
await page.goto(url);
await page.screenshot({
path: 'images/' + url1.hostname + '.png'
});
} catch (error) {
console.log(error.message);
// await page.close();
// await browser.close();
// process.exit(1);
continue;
}
}
})();

Related

Downloading many images with node.js + axios stops downloading suddenly after a while with no errors

It's that time again when I'm clueless and come humbly to ask for help!
I am trying to download 4500 images at once, average 1 MB each. All the images get created and the download starts, but after about 2 GB downloaded (so about half) some images are complete, some partial, some empty, and Task Manager confirms the download stops suddenly.
What could possibly be the issue? No matter how long I wait, nothing happens; at least if I got an error I would try something else...
Please advise if possible, thank you!
//get all json files from a folder
const fs = require("fs");
const path = require("path");
const axios = require("axios");
let urlsArray = [];
const collection = "rebels";
const folder = collection + "_json";
const getFiles = (folder) => {
const directoryPath = path.join(__dirname, folder);
return fs.readdirSync(directoryPath);
};
const files = getFiles(folder);
//inside the folder there are json files with metadata
//for each json file parse it and get the image url
files.forEach((file) => {
const filePath = path.join(__dirname, folder, file);
const fileContent = fs.readFileSync(filePath, "utf8");
const parsedJson = JSON.parse(fileContent);
const imageFromMetadata = parsedJson.image;
const url = imageFromMetadata.replace("ipfs://", "https://ipfs.io/ipfs/");
let nr = file.replace(".json", "");
urlsArray.push({ url, nr });
});
//foreach url create a promise to download with axios
const downloadImage = (url, nr) => {
const writer = fs.createWriteStream(
process.cwd() + `/${collection}_images2/${nr}.png`
);
return axios({
url,
method: "GET",
responseType: "stream",
}).then((response) => {
return new Promise((resolve, reject) => {
response.data.pipe(writer);
writer.on("finish", resolve);
writer.on("error", reject);
});
});
};
const promiseAll = async () => {
const promises = urlsArray.map((data) => {
console.log(`trying to download image nr ${data.nr} from ${data.url}`);
return downloadImage(data.url, data.nr);
});
await Promise.allSettled(promises);
};
promiseAll();
//download all
Since Promise.allSettled() never rejects, nothing in your code will report on any rejected promises that it sees. So, I'd suggest you iterate its results and see if you have any rejected promises there.
You can do that like this:
const results = await Promise.allSettled(promises);
console.log(`results.length = ${results.length}`);
for (const r of results) {
if (r.status === "rejected") {
console.log(r.reason);
}
}
console.log("all done");
This will verify that you got through to the end of the Promise.allSettled(promises) call, confirm that you got a non-zero number of results, and log any rejected promises you got.
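If the rejection reasons turn out to be connection errors (ECONNRESET, socket hang up and the like), the usual next step is to stop firing all 4500 requests at once. A rough sketch that reuses the downloadImage and urlsArray from the question, but works through them in fixed-size batches:
const BATCH_SIZE = 50; // assumption: tune this to your connection and the IPFS gateway's limits

const promiseAllInBatches = async () => {
  for (let i = 0; i < urlsArray.length; i += BATCH_SIZE) {
    const batch = urlsArray.slice(i, i + BATCH_SIZE);
    // wait for one batch to settle before starting the next
    const results = await Promise.allSettled(
      batch.map((data) => downloadImage(data.url, data.nr))
    );
    results.forEach((r, j) => {
      if (r.status === "rejected") {
        console.log(`image nr ${batch[j].nr} failed:`, r.reason);
      }
    });
    console.log(`batch starting at ${i} done`);
  }
};

promiseAllInBatches();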

fs.writeFile crashes node app after writing first json file

I'm trying to crawl several web pages to check for broken links and write the results for each page to a JSON file; however, after the first file is completed the app crashes with no error popping up...
I'm using Puppeteer to crawl, Bluebird to run each link concurrently and fs to write the files.
WHAT I'VE TRIED:
switching the file type to '.txt' or '.php'; this works, but I need to create another loop outside the current workflow to convert the files from '.txt' to '.json'. Renaming the file right after writing to it also causes the app to crash.
using try/catch statements around fs.writeFile, but it never throws an error
running the entire app outside of express; this worked at some point, but I'm trying to use it within the framework
const express = require('express');
const router = express.Router();
const puppeteer = require('puppeteer');
const bluebird = require("bluebird");
const fs = require('fs');
router.get('/', function(req, res, next) {
(async () => {
// Our (multiple) URLs.
const urls = ['https://www.testing.com/allergy-test/', 'https://www.testing.com/genetic-testing/'];
const withBrowser = async (fn) => {
const browser = await puppeteer.launch();
try {
return await fn(browser);
} finally {
await browser.close();
}
}
const withPage = (browser) => async (fn) => {
const page = await browser.newPage();
// Turns request interceptor on.
await page.setRequestInterception(true);
// Ignore all the asset requests, just get the document.
page.on('request', request => {
if (request.resourceType() === 'document' ) {
request.continue();
} else {
request.abort();
}
});
try {
return await fn(page);
} finally {
await page.close();
}
}
const results = await withBrowser(async (browser) => {
return bluebird.map(urls, async (url) => {
return withPage(browser)(async (page) => {
await page.goto(url, {
waitUntil: 'domcontentloaded',
timeout: 0 // Removes timeout.
});
// Search for urls we want to "crawl".
const hrefs = await page.$$eval('a[href^="https://www.testing.com/"]', as => as.map(a => a.href));
// Predefine our arrays.
let links = [];
let redirect = [];
// Loops through each /goto/ url on page
for (const href of Object.entries(hrefs)) {
const response = await page.goto(href[1], {
waitUntil: 'domcontentloaded',
timeout: 0 // Remove timeout.
});
const chain = response.request().redirectChain();
const link = {
'source_url': href[1],
'status': response.status(),
'final_url': response.url(),
'redirect_count': chain.length,
};
// Loops through the redirect chain for each href.
for ( const ch of chain) {
redirect = {
status: ch.response().status(),
url: ch.url(),
};
}
// Push all info of target link into links
links.push(link);
}
// JSONify the data.
const linksJson = JSON.stringify(links);
let fileName = url.replace('https://www.testing.com/', '');
fileName = fileName.replace(/[^a-zA-Z0-9\-]/g, '');
// Write data to file in /tmp directory.
fs.writeFile(`./tmp/${fileName}.json`, linksJson, (err) => {
if (err) {
return console.log(err);
}
});
});
}, {concurrency: 4}); // How many pages to run at a time.
});
})();
});
module.exports = router;
UPDATE:
So there is nothing wrong with my code... I realized nodemon was stopping the process after each file was saved. Since nodemon would detect a "file change", it kept restarting my server after the first item.
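A simple fix for that (assuming a default nodemon setup and the ./tmp output folder used in the code above) is to tell nodemon to ignore the directory the JSON files are written to, for example with a nodemon.json next to the app:
{
  "ignore": ["tmp/*"]
}
Running nodemon with --ignore tmp/ on the command line does the same thing.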

Controlling margins when making a PDF using Playwright

When making a PDF from a headless Chrome Playwright session (code below) I get the PDF on the left, whereas if I make it through Save as PDF in Chrome, I get the second output.
I have set the margins explicitly and used preferCSSPageSize: true, and both give the same (left) outcome.
How would I get Playwright to give me the same output as chrome does from the print dialog?
An example of the files being printed is here. (In real life, they're all slightly different to account for the spine width.)
const fs = require("fs");
const path = require("path");
const { chromium } = require("playwright");
const directoryPath = "outside";
(async () => {
const filesToPrint = getFilesToPrintList(directoryPath);
filesToPrint.forEach((f, i) => console.log(i, f));
const browser = await chromium.launch({ headless: true });
const context = await browser.newContext();
for (let i = 0; i < filesToPrint.length; i++) {
const htmlFilename = path.join(directoryPath, filesToPrint[i]);
const pdfFilename = makePDFfilename(htmlFilename);
console.log("[html file]", htmlFilename);
console.log("[pdf file]", pdfFilename);
const page = await context.newPage();
await page.goto(
"file:///" + path.resolve(htmlFilename),
{ waitUntil: "networkidle" }
);
const options = {
path: pdfFilename,
pageRanges: "1",
preferCSSPageSize: true,
};
console.log(
`\n\nAbout to print:\n ${pdfFilename}\n`,
`with:\n${JSON.stringify(options, null, 2)}`
);
await page.pdf(options);
}
await browser.close();
console.log("done");
})();
function makePDFfilename(htmlFilename) {
const parts = htmlFilename.split(/[\\\._]/);
const pdfFilename = path.join(
directoryPath,
`Experimental_${parts[1]}_${parts[2]}.pdf`
);
return pdfFilename;
}
function getFilesToPrintList(directoryPath) {
let theFiles = fs
.readdirSync(directoryPath)
.filter((f) => f.includes("fp_") && f.includes(".html"));
return theFiles;
}
I tried to reproduce your issue:
I used a different background image URL (online) with your HTML page; see the test.html gist.
Apparently, a number of the options you set are already the defaults. See page.pdf().
Also, I added browser.newContext() and browser.close().
Here's the minimal working NodeJS code:
generate-pdf.js
const { chromium } = require("playwright");
(async () => {
const htmlFilename = "file:///<file-path>/test.html";
console.log("[webpage]", htmlFilename);
const browser = await chromium.launch();
const context = await browser.newContext();
const page = await context.newPage();
await page.goto(htmlFilename);
const pdfFilename = "./test.pdf";
console.log("[pdf file]", pdfFilename);
const options = {
path: pdfFilename,
pageRanges: "1",
preferCSSPageSize: true
};
console.log(
`\n\nAbout to print:\n ${pdfFilename}\n`,
`with:\n${JSON.stringify(options, null, 2)}`
);
await page.pdf(options);
await browser.close();
console.log("done");
})();
Console Output:
$ node generate-pdf.js
[webpage] file:///<file-path>/test.html
[pdf file] ./test.pdf
About to print:
./test.pdf
with:
{
"path": "./test.pdf",
"pageRanges": "1",
"preferCSSPageSize": true
}
done
Snapshot of PDF (test.pdf):
You might want to test this separately and see if it works for your use-case.
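If the output still doesn't match Chrome's print dialog, it may also help to spell out the size and margins in page.pdf instead of relying on the defaults. A sketch (the concrete values are assumptions; match them to the @page rules in your HTML):
const options = {
  path: pdfFilename,
  pageRanges: "1",
  preferCSSPageSize: true, // honour any @page size declared in the CSS
  printBackground: true,   // include CSS backgrounds, which page.pdf() leaves out by default
  margin: { top: "0mm", right: "0mm", bottom: "0mm", left: "0mm" },
};
await page.pdf(options);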

reuse browser instance puppeteer

I would like to know if it is possible to have one .js file that opens a browser instance, creates a new page/tab, logs in to a website (with username/password) and just stays idle. And then, in a second .js file, use the first file's browser instance and its page.
1.js
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({
headless: true,
args: ['--no-sandbox'],
ignoreDefaultArgs: ["--hide-scrollbars"]
});
const page = await browser.newPage();
const response = await page.goto('https://google.com');
console.log('Browser open in the background (headless)!');
//await browser.close();
})();
2.js
const puppeteer = require('puppeteer');
(async () => {
// instructions on browser instance/page from 1.js ...
})();
The crawler object keeps the state of the browser instance, and
wherever you call/pass that instance, it refers to the same Chromium
in the "background". If this is overkill and you just want to
connect to an already running Chromium using puppeteer, you can do it
with puppeteer.connect. Take a look at this:
How to "hook in" puppeteer into a running Chrome instance/tab – mbit
Yeah, I guess it's overkill for me :). But the link you posted was what I wanted, though I have 2 questions.
This is a sample of what I have.
// 1.js
// open chromium, new tab, go to google.com, print browserWSEndpoint, disconnect
const puppeteer = require('puppeteer');
(async () => {
var browser = await puppeteer.launch({headless: false});
var page = await browser.newPage();
var response = await page.goto('https://google.com');
var browserWSEndpoint = browser.wsEndpoint();
console.log(browserWSEndpoint); // prints: ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e
browser.disconnect();
})();
And
// 2.js
// connect to the open browser with the browserWSEndpoint manually put in, ..., disconnect.
const puppeteer = require('puppeteer');
(async () => {
var browser = await puppeteer.connect({browserWSEndpoint: 'ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e'});
// somehow use the tab that is open from 1.js (google.com)
await browser.disconnect();
})();
I get the browserWSEndpoint string from the console.log in 1.js.
It works great, but I have two difficulties.
1 - How can I use the variable browserWSEndpoint from 1.js so I don't have to always copy-paste it into 2.js?
2 - If I open a new page/tab in 1.js, go for example to google and disconnect (browser.disconnect()), how can I use that page/tab in 2.js?
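A sketch covering both points (assuming the two scripts run from the same folder and can share a small file on disk): have 1.js write the endpoint to a file instead of the console, then have 2.js read it, reconnect, and pick up the already-open tab with browser.pages():
// 1.js
const fs = require('fs');
const puppeteer = require('puppeteer');
(async () => {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.goto('https://google.com');
  fs.writeFileSync('wsEndpoint.txt', browser.wsEndpoint()); // persist instead of copy-pasting
  browser.disconnect();
})();

// 2.js
const fs = require('fs');
const puppeteer = require('puppeteer');
(async () => {
  const browserWSEndpoint = fs.readFileSync('wsEndpoint.txt', 'utf8');
  const browser = await puppeteer.connect({ browserWSEndpoint });
  const pages = await browser.pages();                        // every tab still open in that browser
  const page = pages.find(p => p.url().includes('google.com')) || pages[0];
  console.log('reusing tab:', page.url());
  await browser.disconnect();
})();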
Working, tested code.
getEmail.js is where the actual page will be exported. Ask for clarifications in the comments.
getBrowser.js
const puppeteer = require("puppeteer");
module.exports = {
  browser: {},
  pptr_instance_url: "",
  getBrow: async function () {
    try {
      console.log("line6", this.pptr_instance_url);
      // try to reuse the existing browser; if connecting fails, launch a new one and remember its endpoint
      this.browser = await puppeteer.connect({ browserWSEndpoint: this.pptr_instance_url }).catch(async (e) => {
        console.log("end point", this.pptr_instance_url);
        this.browser = await puppeteer.launch({ timeout: 0 });
        this.pptr_instance_url = this.browser.wsEndpoint();
        console.log("line 11", this.pptr_instance_url);
        return this.browser;
      });
      return this.browser;
    } catch (e) {
      console.log(e);
    }
  }
};
pageRenderer.js
const abc = require("../getBrowsernew")
const pageRenderer = async (request) => {
const {reactProjectUrl} = constants, uuidStorageKey = uuidv4(),
localStorageObject = {[uuidStorageKey]: request.body};
const browser = await abc.getBrow();
let url = "someurl.com"
await setLocalStorage(browser, url, localStorageObject);
const page = await browser.newPage();
const response = await page.goto(
url,
{
waitUntil: "networkidle0"
}, {waitUntil: 'load', timeout: 0}
);
return page;
}
module.exports = pageRenderer;
getEmail.js
const pageRenderer = require("./pageRenderer");
const getEmail =async (request) =>{
const page = await pageRenderer(request)
const emailbody = await page.content();
page.close();
return emailbody;
}
module.exports = getEmail;
You can implement this in many ways, such as having separate modules with functions, or different classes; it depends on your particular need.
You can have a class that launches the browser and creates pages, plus some extra functionality.
//1.js
const puppeteer = require('puppeteer');
class Crawler {
constructor() {
//init with whatever values you'll need in your class
//or throw an error if the object wasn't created through build
}
static async build() {
let crawler = new Crawler();
await crawler._init();
return crawler;
}
async _init() {
//launch the browser and keep its state
this._browser = await puppeteer.launch({timeout: 0});
//create a page and keep its state
this._page = await this._browser.newPage();
}
//getter
get browser() {
return this._browser;
}
//getter
get page() {
return this._page;
}
async login(url) {
await this._page.goto(url);
//do whatever is related to the login process
}
}
module.exports = {Crawler};
Note that a constructor can't be async. Since launching the browser is async, we use something like a static build function to initiate the browser when creating the object. Then we create the crawler object like this:
//2.js
const {Crawler} = require('./1.js');
(async() => {
let crawler = await Crawler.build();
await crawler.login("https://example.com");
//access crawler's page
console.log(crawler.page.url());
})();
Keep in mind that this is only an example and by no means representative of the best practices. So first, you need to understand what you want to achieve out of such encapsulation, then adopt the method that suits you best.
Read more on JS classes here

Node js request huge amount of pictures

I am trying to request a huge number of images in node js (n > 3000)
and then save them into a folder.
I have tried using the request library and request-promise.
The problem is that if the number of pictures is too big, some pictures do not finish downloading, leaving incomplete data, or error out (an empty .jpeg file). Is there a better way of downloading huge numbers of pictures?
Also, once all the pictures have downloaded, I need to call a function compilee()
to make them into a video. I am using ffmpeg for that.
Below are the 2 ways I tried doing this.
const req = require('request');
const request = require('request-promise');
const fs = require('fs');
const ffmpegPath = require('@ffmpeg-installer/ffmpeg').path;
const ffmpeg = require('fluent-ffmpeg');
ffmpeg.setFfmpegPath(ffmpegPath);
function downloadImgs(imgUrls) {
let promises = [];
let prom;
for (let i = 0; i < imgUrls.length; i++) {
imgPath = `assets/pics/st_${pad(i + 1, 3)}.jpg`
prom = request(imgUrls[i]);
prom.on('error', () => console.log('err'));
prom.pipe(fs.createWriteStream(imgPath));
promises.push(prom);
}
Promise.all(promises).then(() => {
console.log('I Run');
compilee();
});
//SECOND TRY-----------------------------------------
for (let i = 0; i < imgUrls.length; i++) {
imgUrl = imgUrls[i];
req(imgUrl)
.on('error', () => console.log("img error", imgUrl))
.pipe(fs.createWriteStream(`assets/pics/st_${pad(i + 1, 3)}.jpg`)).on('close', () => {
console.log('downloaded', x)
})
}
compilee();
//---------------------------------------------------------
function compilee() {
command
.on('end', onEnd)
// .on('progress', onProgress)
.on('error', onError)
.input('assets/pics/st_%03d.jpg')
.inputFPS(5)
.output('assets/output/pepe.mp4')
.outputFps(30)
.run();
}
The error I am getting for the first: UnhandledPromiseRejectionWarning: RequestError: Error: socket hang up
and for the second:
Error: socket hang up
at createHangUpError (_http_client.js:330:15)
at TLSSocket.socketOnEnd (_http_client.js:433:23)
at TLSSocket.emit (events.js:187:15)
at endReadableNT (_stream_readable.js:1098:12)
at process.internalTickCallback
(internal/process/next_tick.js:72:19) code: 'ECONNRESET' }
Also, should I download the pictures on the server or on the client if I want to send the video to the client immediately?
I'd suggest you use the async library. Its queue might be a good solution:
var q = async.queue(function({url,index}, callback) {
// Do ur job here
}, 5);
// assign a callback
q.drain = function() {
console.log('All images downloaded');
};
// add items to the queue
for (let i = 0; i < imgUrls.length; i++) {
q.push({url : imgUrls[i], index : i});
}
Of course it is not a complete working code block, but it is just to give you the idea. Hope it helps.
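To make the idea concrete, here is a fuller sketch of the queue wired to the question's code; imgUrls and compilee() are assumed to exist as in the question, and String.padStart stands in for the missing pad() helper:
const async = require('async');
const req = require('request');
const fs = require('fs');

const q = async.queue(function ({ url, index }, callback) {
  const imgPath = `assets/pics/st_${String(index + 1).padStart(3, '0')}.jpg`;
  let done = false;
  const finish = (err) => {          // guard so callback fires exactly once per task
    if (done) return;
    done = true;
    if (err) console.log('img error', url, err.message);
    callback();
  };
  const writer = fs.createWriteStream(imgPath);
  req(url).on('error', finish).pipe(writer);
  writer.on('finish', () => finish());
  writer.on('error', finish);
}, 5); // at most 5 downloads in flight

q.drain = function () {              // note: with async v3+, register this as q.drain(() => { ... }) instead
  console.log('All images downloaded');
  compilee(); // assumed: the ffmpeg step from the question
};

for (let i = 0; i < imgUrls.length; i++) {
  q.push({ url: imgUrls[i], index: i });
}
The concurrency of 5 is an assumption; raise or lower it depending on how the remote server behaves.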
