I'm trying to crawl several web pages to check for broken links and write the results for each page to a JSON file. However, after the first file is written, the app crashes without showing any error.
I'm using Puppeteer to crawl, Bluebird to run each link concurrently and fs to write the files.
What I've tried:
switching file type to '.txt' or '.php', this works but I need to create another loop outside the current workflow to convert the files from '.txt' to '.json'. Renaming the file right after writing to it also causes the app to crash.
using try catch statements for fs.writeFile but it never throws an error
running the entire app outside of Express; this worked at some point, but I am trying to use it within the framework
const express = require('express');
const router = express.Router();
const puppeteer = require('puppeteer');
const bluebird = require("bluebird");
const fs = require('fs');
/**
 * GET / - crawls each seed page, checks every same-site link found on it and
 * writes one JSON report per seed page into ./tmp.
 *
 * Responds with a JSON array holding one `{ url, file, links }` summary per
 * seed page. Any failure of the crawl itself is forwarded to Express through
 * `next(err)` instead of being lost as an unhandled rejection.
 *
 * NOTE: when the server runs under a file watcher such as nodemon, the
 * reports written to ./tmp count as file changes and restart the process
 * mid-crawl; exclude ./tmp from the watcher.
 */
router.get('/', async function(req, res, next) {
  // Our (multiple) URLs.
  const urls = ['https://www.testing.com/allergy-test/', 'https://www.testing.com/genetic-testing/'];
  const siteRoot = 'https://www.testing.com/';
  const outputDir = './tmp';

  // Runs `fn` with a freshly launched browser and always closes it afterwards.
  const withBrowser = async (fn) => {
    const browser = await puppeteer.launch();
    try {
      return await fn(browser);
    } finally {
      await browser.close();
    }
  };

  // Runs `fn` with a new page that only loads documents, then closes the page.
  const withPage = (browser) => async (fn) => {
    const page = await browser.newPage();
    // Turns request interceptor on.
    await page.setRequestInterception(true);
    // Ignore all the asset requests, just get the document.
    page.on('request', (request) => {
      if (request.resourceType() === 'document') {
        request.continue();
      } else {
        request.abort();
      }
    });
    try {
      return await fn(page);
    } finally {
      await page.close();
    }
  };

  // Visits one link and describes the outcome. A failed navigation is
  // recorded on the entry instead of aborting the whole crawl.
  const checkLink = async (page, href) => {
    try {
      const response = await page.goto(href, {
        waitUntil: 'domcontentloaded',
        timeout: 0 // Remove timeout.
      });
      // goto() resolves to null when no main-resource response exists,
      // e.g. for a link that only changes the hash of the current page.
      if (response === null) {
        return {
          source_url: href,
          status: null,
          final_url: page.url(),
          redirect_count: 0,
        };
      }
      return {
        source_url: href,
        status: response.status(),
        final_url: response.url(),
        redirect_count: response.request().redirectChain().length,
      };
    } catch (err) {
      return {
        source_url: href,
        status: null,
        final_url: null,
        redirect_count: 0,
        error: err.message,
      };
    }
  };

  try {
    // The report directory may not exist on a fresh checkout.
    await fs.promises.mkdir(outputDir, { recursive: true });

    const results = await withBrowser((browser) =>
      bluebird.map(urls, (url) =>
        withPage(browser)(async (page) => {
          await page.goto(url, {
            waitUntil: 'domcontentloaded',
            timeout: 0 // Removes timeout.
          });
          // Search for urls we want to "crawl".
          const hrefs = await page.$$eval('a[href^="https://www.testing.com/"]', (as) => as.map((a) => a.href));

          // Links are checked one after another because they share this page.
          const links = [];
          for (const href of hrefs) {
            links.push(await checkLink(page, href));
          }

          // Derive a file-system-safe name from the path part of the seed URL.
          const fileName = url.replace(siteRoot, '').replace(/[^a-zA-Z0-9\-]/g, '');
          const file = `${outputDir}/${fileName}.json`;
          // Awaited so the browser is not torn down while reports are still being written.
          await fs.promises.writeFile(file, JSON.stringify(links));
          return { url, file, links };
        }),
      { concurrency: 4 }) // How many pages to run at a time.
    );

    res.json(results);
  } catch (err) {
    next(err);
  }
});
module.exports = router;
UPDATE:
So there is nothing wrong with my code... I realized nodemon was stopping the process after each file was saved. Because nodemon detected a "file change", it kept restarting my server after the first file was written.
Related
I have a page on this link (https://master.d3tei1upkyr9mb.amplifyapp.com/report) with 3 export buttons.
These export buttons generate XLSX, CSV, PDF on the frontend, and hence there are no URLs for XLSX, CSV, PDF.
I need puppeteer to be able to download or get or intercept the blobs or buffers of these files in my node backend.
I tried different ways to achieve this but still haven't figured it out.
It was possible through playwright library through the code written below. But I need to be able to do it with Puppeteer.
const {chromium} = require('playwright');
const fs = require('fs');
// Clicks the XLSX export button, waits for the resulting download and copies
// the downloaded file to test.xlsx in the current working directory.
(async () => {
  const browser = await chromium.launch();
  try {
    const context = await browser.newContext({acceptDownloads: true});
    const page = await context.newPage();
    await page.goto('http://localhost:3000/');
    const [ download ] = await Promise.all([
      page.waitForEvent('download'), // <-- start waiting for the download
      page.click('button#expoXLSX') // <-- perform the action that directly or indirectly initiates it.
    ]);
    const path = await download.path();
    console.log(path);
    // readFileSync is synchronous and returns the Buffer directly; no await needed.
    const newFile = fs.readFileSync(path);
    console.log(newFile);
    fs.writeFile("test.xlsx", newFile, "binary", function(err) {
      if (err) {
        console.log(err);
      } else {
        console.log("The file was saved!");
      }
    });
  } finally {
    // Close the browser even when a step above throws, so no process is left behind.
    await browser.close();
  }
})();
Is there any way?
Is there any reason not to simulate the click on the frontend and let Puppeteer download the file to a location of your choice? You can easily download the file this way with the following:
Edit: You can determine when the file download completes by listening to the Page.downloadProgress event and checking for the completed state. Getting the actual filename saved to disk isn't 100% guaranteed with this method, but you are able to get what is termed the suggestedFileName from the Page.downloadWillBegin event, which in my tests thus far (at least on the example page in the question) does match the filename persisted to disk.
const puppeteer = require('puppeteer');
const path = require('path');
const downloadPath = path.resolve('./download');
// Opens the report page, allows downloads into `downloadPath`, clicks the PDF
// export button and logs the download's start and completion.
(async () => {
  // Filled in by the downloadWillBegin event, read by the progress handler.
  let fileName;

  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.goto('https://master.d3tei1upkyr9mb.amplifyapp.com/report', {
    waitUntil: 'networkidle2',
  });

  // NOTE(review): `_client` is an underscore-prefixed member of the page,
  // presumably internal to Puppeteer; confirm it exists in the version in use.
  const devtools = page._client;
  await devtools.send('Page.setDownloadBehavior', {
    behavior: 'allow',
    downloadPath,
  });

  // Listener registration is synchronous, so it is not awaited.
  devtools.on('Page.downloadWillBegin', (event) => {
    console.log('download beginning,', event.url, event.suggestedFilename);
    fileName = event.suggestedFilename;
  });
  devtools.on('Page.downloadProgress', (progress) => {
    if (progress.state !== 'completed') {
      return;
    }
    console.log('download completed. File location: ', `${downloadPath}/${fileName}`);
  });

  await page.click('button#expoPDF');
})();
I'm trying to manipulate forms of sites w/ iFrames in it using Puppeteer. I tried different ways to reach a specific iFrame, or even to count iFrames in a website, with no success.
Why isn't Puppeteer's object recognizing the iFrames / child frames of the page I'm trying to navigate through?
It's happening with other pages as well, such as https://www.veiculos.itau.com.br/simulacao
const Apify = require('apify');
const sleep = require('sleep-promise');
// Opens the target page, waits 15 seconds for it to settle and logs one line
// per child frame of the main frame.
Apify.main(async () => {
  // Launch the web browser.
  const browser = await Apify.launchPuppeteer();
  try {
    // Create and navigate new page
    console.log('Open target page');
    const page = await browser.newPage();
    await page.goto('https://www.credlineitau.com.br/');
    await sleep(15 * 1000);
    // childFrames() returns an array: iterate its values with for...of
    // (for...in would walk the index keys instead of the frames).
    for (const frame of page.mainFrame().childFrames()) {
      console.log('test');
    }
  } finally {
    // Always release the browser, even when navigation fails.
    await browser.close();
  }
});
Perhaps you'll find some helpful inspiration below.
/**
 * Waits until an element inside an iframe exists and has non-empty text.
 *
 * @param {object} page - object exposing `waitForFunction` (a Puppeteer page).
 * @param {string} frameSelector - selector of the iframe element in the page.
 * @param {string} contentSelector - selector evaluated inside the iframe's document.
 * @returns {Promise<void>} resolves once the predicate turns truthy; the
 *   rejection of `waitForFunction` (e.g. on timeout) is propagated.
 */
const waitForIframeContent = async (page, frameSelector, contentSelector) => {
  await page.waitForFunction((frameSelector, contentSelector) => {
    const frame = document.querySelector(frameSelector);
    // The iframe, or its document, may not be available yet. Report
    // "not ready" so polling continues instead of throwing a TypeError.
    const frameDocument = frame && frame.contentDocument;
    if (!frameDocument) {
      return false;
    }
    const node = frameDocument.querySelector(contentSelector);
    return node && node.innerText;
  }, {
    timeout: TIMEOUTS.ten,
  }, frameSelector, contentSelector);
};
// Look up the iframe element; a failed wait yields null instead of rejecting.
const $frame = await waitForSelector(page, SELECTORS.frame.iframeNode).catch(() => null);
if ($frame) {
  // Pick the frame object registered under the name 'content-iframe'.
  const isContentFrame = (candidate) => candidate.name() === 'content-iframe';
  const frame = page.frames().find(isContentFrame);
  // Same null-on-failure convention for the status element inside the frame.
  const $cancelStatus = await waitForSelector(frame, SELECTORS.frame.membership.cancelStatus).catch(() => null);
  // Finally wait until that element also carries text.
  await waitForIframeContent(page, SELECTORS.frame.iframeNode, SELECTORS.frame.membership.cancelStatus);
}
Give it a shot.
I am trying to build an API that calls the external Adobe Stock service.
As the title says, the first time I read the query passed by the Link router it is undefined, but after reloading the page it works correctly. My
main page
<Link
href={{
pathname: "/kategoria-zdjec",
query: images.zdjecia_kategoria
}}
as={`/kategoria-zdjec?temat=${images.zdjecia_kategoria}`}
className={classes.button}>
</Link>
and my server
app
.prepare()
.then(() => {
server.get("/kategoria-zdjec", async (req, res) => {
const temat = await req.query.temat;
console.log(temat)
const url = `https://stock.adobe.io/Rest/Media/1/Search/Files?locale=pl_PL&search_parameters[words]=${temat}&search_parameters[limit]=24&search_parameters[offset]=1`;
try {
const fetchData = await fetch(url, {
headers: { ... }
});
const objectAdobeStock = await fetchData.json();
res.json(objectAdobeStock);
const totalObj = await objectAdobeStock.nb_results;
const adobeImages = await objectAdobeStock.files;
} catch (error) {
console.log(error);
}
});
and this is what getInitialProps looks like on the next page
// Loads the search result from the /kategoria-zdjec endpoint and exposes the
// result count and the file list as page props.
Zdjecia.getInitialProps = async ({req}) => {
  const response = await fetch("/kategoria-zdjec");
  const { nb_results: total, files: images } = await response.json();
  return { total, images };
};
I think the problem is related to asynchronous code.
I think this might be due to the fact that you are using fetch which is actually part of the Web API and this action fails when executed on server.
You could either use isomorphic-fetch which keeps fetch API consistent between client and server, or use node-fetch when fetch is called on the server:
// Loads the search result and exposes it as page props, picking a fetch
// implementation that exists in the current environment.
Zdjecia.getInitialProps = async ({ req, isServer }) => {
  // Honour `isServer` when the caller provides it; otherwise detect the
  // environment directly, so `window` is never touched on the server.
  const onServer = typeof isServer === 'boolean' ? isServer : typeof window === 'undefined';
  const fetch = onServer ? require('node-fetch') : window.fetch;
  // NOTE(review): node-fetch presumably needs an absolute URL, so this
  // relative path may still need a host prefix on the server - confirm.
  const res = await fetch("/kategoria-zdjec");
  const json = await res.json();
  return { total: json.nb_results, images: json.files };
};
This problem is solved. The issue was in another part of my app, in the state management: I created new variables and passed the state value to the link.
I would like to know if it is possible to have one .js file that opens a browser instance, creates new page/tab logs in to a website (with username/password) and just stays idle. And in a second .js file use file one browser instance and its page.
1.js
const puppeteer = require('puppeteer');
// Starts a headless browser, opens google.com in a new tab and leaves the
// browser running.
(async () => {
  const launchOptions = {
    headless: true,
    args: ['--no-sandbox'],
    ignoreDefaultArgs: ["--hide-scrollbars"],
  };
  const browser = await puppeteer.launch(launchOptions);

  const tab = await browser.newPage();
  await tab.goto('https://google.com');
  console.log('Browser open in the background (headless)!');
  // The browser is left open here: browser.close() is not called.
})();
2.js
const puppeteer = require('puppeteer');
// Placeholder for the second script: the work that should reuse the browser
// instance/page opened by 1.js goes inside this async IIFE.
(async () => {
// instructions on browser instance/page from 1.js ...
})();
The crawler object keeps the state of the browser instance and
wherever you call/pass that instance, it refers to the same chromium
in the "background". If this is an overkill, and you just want to
connect to an already running chromium using puppeteer, you can do it
with puppeteer.connect. take a look at this:
How to "hook in" puppeteer into a running Chrome instance/tab – mbit
Yeah, I guess it's too much overkill for me :). The link you posted was what I wanted, but I have two questions.
This is a sample of what I have.
// 1.js
// open chromium, new tab, go to google.com, print browserWSEndpoint, disconnect
const puppeteer = require('puppeteer');
// Launches a visible browser, opens google.com, prints the browser's
// WebSocket endpoint and detaches without closing the browser.
(async () => {
  const browser = await puppeteer.launch({ headless: false });
  const tab = await browser.newPage();
  await tab.goto('https://google.com');

  const endpoint = browser.wsEndpoint();
  console.log(endpoint); // prints: ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e

  // disconnect() detaches this script; the browser itself is not closed.
  browser.disconnect();
})();
And
// 2.js
// connect to the open browser with the browserWSEndpoint manualy put in, ... , disconect.
const puppeteer = require('puppeteer');
// Attaches to the already-running browser through its WebSocket endpoint and
// detaches again.
(async () => {
  // Endpoint printed by 1.js, pasted in by hand.
  const browserWSEndpoint = 'ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e';
  const browser = await puppeteer.connect({ browserWSEndpoint });
  // somehow use the tab that is open from 1.js (google.com)
  await browser.disconnect();
})();
I get the browserWSEndpoint string from the console.log 1.js.
It works great but I have two difficulties.
1 - How can I use the variable browserWSEndpoint from 1.js so I don't always have to copy and paste it into 2.js?
2 - If I open a new page/tab in 1.js, go to Google for example, and disconnect (browser.disconnect()), how can I use that page/tab in 2.js?
Working tested code
getEmail.js is where the actual page is exported. Ask for clarification in the comments.
getBrowser.js
const puppeteer = require("puppeteer");
module.exports = {
browser: {},
pptr_instance_url:"",
getBrow: async function(){ try {
console.log("line6",this.pptr_instance_url);
this.browser = await puppeteer.connect({browserWSEndpoint: this.pptr_instance_url}).catch(async e =>{
console.log("end point",this.pptr_instance_url);
this.browser = await puppeteer.launch({timeout: 0});
this.pptr_instance_url = this.browser.wsEndpoint();
console.log("line 11",this.pptr_instance_url);
return this.browser;
});
return this.browser;
}catch (e){
console.log(e)
} }
}
pageRenderer.js
const abc = require("../getBrowsernew")
/**
 * Opens the target URL in a new page of the shared browser, after seeding
 * local storage with the request body under a fresh UUID key.
 *
 * @param {object} request - incoming request; its `body` is stored.
 * @returns {Promise<object>} the navigated page; the caller must close it.
 */
const pageRenderer = async (request) => {
  // NOTE(review): reactProjectUrl is read but never used; presumably it is
  // meant to replace the placeholder url below - confirm.
  const { reactProjectUrl } = constants;
  const uuidStorageKey = uuidv4();
  const localStorageObject = { [uuidStorageKey]: request.body };

  const browser = await abc.getBrow();
  const url = "someurl.com";
  await setLocalStorage(browser, url, localStorageObject);

  const page = await browser.newPage();
  // page.goto takes a single options object. The original passed a second
  // one, which was silently ignored, so both sets of options are merged here.
  await page.goto(url, {
    waitUntil: ["load", "networkidle0"],
    timeout: 0,
  });
  return page;
};
module.exports = pageRenderer;
getEmail.js
const pageRenderer = require("./pageRenderer");
/**
 * Renders the page for the given request and returns its HTML.
 *
 * @param {object} request - forwarded unchanged to `pageRenderer`.
 * @returns {Promise<string>} the rendered page content.
 */
const getEmail = async (request) => {
  const page = await pageRenderer(request);
  try {
    return await page.content();
  } finally {
    // Awaited, and run even when content() throws, so the page is never
    // leaked and a failing close() cannot become an unhandled rejection.
    await page.close();
  }
};
module.exports = getEmail;
You can implement this in many ways like having separate modules with functions, or different classes, and it depends on your particular need.
You can have a class that launches the browser and creates pages plus some extra functionalities.
//1.js
const puppeteer = require('puppeteer');
/**
 * Keeps one launched browser and one page together so other modules can
 * share them. Create instances with `Crawler.build()`, because launching the
 * browser is asynchronous and cannot happen in the constructor.
 */
class Crawler {
  constructor() {
    // Set up any plain values the class needs here, or throw when the
    // object was not created through build().
  }

  /** Async factory: creates a crawler and waits for its browser and page. */
  static async build() {
    const instance = new Crawler();
    await instance._init();
    return instance;
  }

  /** Launches the browser, opens a page and stores both on the instance. */
  async _init() {
    const launched = await puppeteer.launch({ timeout: 0 });
    this._browser = launched;
    this._page = await launched.newPage();
  }

  /** The browser launched for this crawler. */
  get browser() {
    return this._browser;
  }

  /** The page opened for this crawler. */
  get page() {
    return this._page;
  }

  /** Navigates the crawler's page to `url`; login steps would follow here. */
  async login(url) {
    await this._page.goto(url);
    // do whatever is related to the login process
  }
}
module.exports = {Crawler};
Note that we can't have async functions in the constructor. Since launching browser is async, we use something like a build function to initiate the browser when creating the object. Then we create the crawler object like this:
//2.js
const {Crawler} = require('./1.js');
// Builds a crawler, performs the login navigation and prints the URL the
// crawler's page ended up on.
(async () => {
  const session = await Crawler.build();
  await session.login("https://example.com");
  // access crawler's page
  const { page } = session;
  console.log(page.url());
})();
Keep in mind that this is only an example and by no means representative of the best practices. So first, you need to understand what you want to achieve out of such encapsulation, then adopt the method that suits you best.
Read more on JS classes here
I have a use case that needs to use Headless Chrome Network (https://chromedevtools.github.io/devtools-protocol/tot/Network/) to intercept all images requests and find out the image size before saving it (basically discard small images such as icons).
However, I am unable to figure out a way to load the image data in memory before saving it. I need to load it into an Img object to get its width and height. Network.getResponseBody takes a requestId, which I don't have access to in Network.requestIntercepted. Also, Network.loadingFinished always gives me "0" in the encodedDataLength variable, and I have no idea why. So my questions are:
How to intercept all responses from jpg/png request and get the image data? Without saving the file via URL string to the disk and load back.
BEST: how to get image dimension from header response? Then I don't have to read the data into memory at all.
My code is below:
const chromeLauncher = require('chrome-launcher');
const CDP = require('chrome-remote-interface');
const file = require('fs');
// Launches headless Chrome, intercepts every request while loading
// yahoo.com, logs details of .jpg/.png requests and of finished loads, then
// shuts everything down once the page's load event fires.
(async function() {
  const chrome = await chromeLauncher.launch({
    chromeFlags: ['--disable-gpu', '--headless'],
  });
  const protocol = await CDP({ port: chrome.port });
  const { DOM, Network, Page, Runtime } = protocol;

  // Enable the domains in the same order as before: Network, Page, Runtime, DOM.
  await Promise.all([Network, Page, Runtime, DOM].map((domain) => domain.enable()));
  await Network.setRequestInterceptionEnabled({ enabled: true });

  Network.requestIntercepted((event) => {
    const { interceptionId, request, resourceType } = event;
    const address = request.url;
    const looksLikeImage = address.includes('.jpg') || address.includes('.png');
    if (looksLikeImage) {
      console.log(JSON.stringify(request));
      console.log(resourceType);
      if (address.includes("/unspecified.jpg")) {
        console.log("FOUND unspecified.jpg");
        console.log(JSON.stringify(interceptionId));
        // Network.getResponseBody takes a requestId, which this event does
        // not provide, so the body cannot be fetched from here.
      }
    }
    // Every intercepted request is let through unchanged.
    Network.continueInterceptedRequest({ interceptionId });
  });

  Network.loadingFinished((event) => {
    console.log(event.requestId);
    console.log(event.timestamp);
    console.log(event.encodedDataLength);
  });

  Page.navigate({ url: 'https://www.yahoo.com/' });

  // Tear down the protocol connection and the Chrome process after load.
  Page.loadEventFired(async () => {
    protocol.close();
    chrome.kill();
  });
})();
This should get you 90% of the way there. It gets the body of each image request. You'd still need to base64decode, check size and save etc...
const CDP = require('chrome-remote-interface');
const sizeThreshold = 1024;
/**
 * Connects to Chrome over the DevTools protocol, loads a page and collects
 * the response body of every .jpg/.png response seen within five seconds.
 *
 * Connection and navigation errors are logged, not thrown; the client is
 * always closed.
 *
 * @returns {Promise<void>}
 */
async function run() {
  let client;
  try {
    client = await CDP();
    const { Network, Page } = client;
    // enable events
    await Promise.all([Network.enable(), Page.enable()]);
    // commands
    const targetUrl = "https://google.co.za";
    const pics = [];
    Network.responseReceived(async ({ requestId, response }) => {
      const url = response ? response.url : null;
      // Skip events without a response URL and anything that is not an image.
      if (!url || !(url.includes('.jpg') || url.includes('.png'))) {
        return;
      }
      try {
        const { body, base64Encoded } = await Network.getResponseBody({ requestId });
        pics.push({ url, body, base64Encoded });
        console.log(url, body, base64Encoded);
      } catch (err) {
        // getResponseBody has been seen to reject here. Handle it locally so
        // the event handler never produces an unhandled rejection.
        console.error(`Could not read the response body for ${url}:`, err);
      }
    });
    await Page.navigate({ url: targetUrl });
    await sleep(5000);
    // TODO: process pics - base64Encoded, check body.length > sizeThreshold, save etc...
  } catch (err) {
    if (err.message && err.message === "No inspectable targets") {
      console.error("Either chrome isn't running or you already have another app connected to chrome - e.g. `chrome-remote-interface inspect`")
    } else {
      console.error(err);
    }
  } finally {
    if (client) {
      await client.close();
    }
  }
}
/**
 * Returns a promise that resolves (with no value) after the given delay.
 *
 * @param {number} [miliseconds=1000] - delay in milliseconds; a value equal
 *   to 0 resolves without scheduling a timer.
 * @returns {Promise<void>}
 */
function sleep(miliseconds = 1000) {
  return miliseconds == 0
    ? Promise.resolve()
    : new Promise((done) => {
        setTimeout(() => done(), miliseconds);
      });
}
run();