Puppeteer: return JSON response of AJAX response - node.js

While the page is loading, I am trying to wait for a certain AJAX request made by my page and then return its response's JSON body. My code does not stop iterating through every response even after the condition is met inside the 'response' event listener.
Once I find the response I want, how can I capture its JSON, stop the page from loading further, and return that JSON?
async function runScrape() {
  const browser = await browserPromise;
  const context = await browser.createIncognitoBrowserContext();
  const page = await context.newPage();
  await page.setDefaultTimeout(60000);
  let apiResponse;
  page.on('response', async response => {
    // url() and status() are synchronous accessors, no await needed
    const url = response.url();
    const status = response.status();
    console.info(status + " NETWORK CALL: " + url);
    if (url.match(requestPattern)) {
      apiResponse = await response.text();
      await page.evaluate(() => window.stop());
    }
  });
  await page.goto(req.query.url);
  console.log("API RESPONSE:\n" + apiResponse);
  return apiResponse;
}
=== UPDATE ===
This was the solution that ended up working. It seemed this approach was required due to the specific behavior of the page being scraped.
async function runScrape() {
  const browser = await browserPromise;
  const context = await browser.createIncognitoBrowserContext();
  const page = await context.newPage();
  await page.setDefaultTimeout(60000);
  await page.setRequestInterception(true);
  let JSONResponse;
  page.on('response', async response => {
    if (!JSONResponse && response.url().match(requestPattern)) {
      JSONResponse = await response.text();
    }
  });
  page.on('request', request => {
    if (request.resourceType() === 'image' || request.resourceType() === 'stylesheet') request.abort();
    else request.continue();
  });
  await page.goto(scrapeURL, {waitUntil: 'networkidle2'});
  await page.close();
  return JSONResponse;
}
runScrape()
  .then(response => {
    res.setHeader("content-type", "application/json");
    res.status(200).send(response);
  })
  .catch(err => {
    let payload = {"errorType": err.name, "errorMessage": err.message + "\n" + err.stack};
    console.error(JSON.stringify(payload));
    res.status(500).json(payload);
  });

I would simplify it to a single page.on('response', ...) listener that looks for the desired request pattern with String.includes().
Once the response is identified, we can emulate the browser's "Stop loading this page" button with await page.evaluate(() => window.stop()). The window.stop() method doesn't close the browser; it just stops the pending network requests.
let resp
page.on('response', async response => {
  if (response.url().includes(requestPattern)) {
    resp = await response.json()
    await page.evaluate(() => window.stop())
  }
})
await page.goto(req.query.url, { waitUntil: 'networkidle0' })
console.log(resp)
Edit:
To avoid an undefined response you should use the waitUntil: 'networkidle0' setting on page.goto(); see the docs about the options. You got undefined because, by default, Puppeteer considers the page loaded once the load event fires (that is the default waitUntil setting). So if the page counts as loaded but there are still network connections in the queue and your request pattern hasn't been matched yet, the script moves on from goto to console.log. By waiting until all network requests have finished, you make sure the matching request is registered before execution continues.
networkidle0: consider navigation to be finished when there are no more than 0 network connections for at least 500 ms.
Please note: with networkidle set, you won't be able to disconnect once the request pattern condition is fulfilled, so your plan to stop the remaining responses won't be possible.
Instead, I recommend aborting the resource types that are not needed; this way you may get results similar to stopping the requests.
For example, place the following right after the page.on('response', async response => {...}) block ends:
await page.setRequestInterception(true)
page.on('request', request => {
  if (request.resourceType() === 'image' || request.resourceType() === 'stylesheet') request.abort()
  else request.continue()
})
You can also use it with a request.url().includes(unwantedRequestPattern) condition if you know which connections you don't need.
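For instance, a minimal sketch of that variant; unwantedRequestPattern is a placeholder for whatever substring identifies the connections you want to drop, not a Puppeteer built-in:
const unwantedRequestPattern = 'analytics' // placeholder pattern, adjust to your page

await page.setRequestInterception(true)
page.on('request', request => {
  // drop anything whose URL matches the unwanted pattern, let the rest through
  if (request.url().includes(unwantedRequestPattern)) request.abort()
  else request.continue()
})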

Related

Puppeteer - ExpressJS infinite loop a function

I have a problem with my Express JS app: when I try to call a function, it gets called endlessly. It opens a lot of Chromium browsers and causes performance issues.
I just want to call this function one time.
I've found a way to make it run only once, but with this approach I can't pass any parameters:
const farm = (async () => {
  const browser = await puppeteer.launch({headless: true});
  const page = await browser.newPage();
  await page.goto("https://www.example.com/?s=" + term);
  await page.waitForSelector("div");
  const postLinks = await page.evaluate(() => {
    let postLinks = [];
    let elements = document.querySelectorAll('div.article');
    for (element of elements) {
      postLinks.push({
        title: element.querySelector('div.meta-info > h3 > a')?.textContent,
        url: element.querySelector('div.meta-info > h3 > a')?.href
      })
    }
    return postLinks;
  });
  console.log(postLinks);
  await browser.close();
})();
app.get('/', (req, res) => {
  var term = "Drake";
  res.send(farm);
});
With the code below, I can pass parameters, but I can't return the result in res.send, and the function is called endlessly:
const farm = async (term) => {
  const browser = await puppeteer.launch({headless: true});
  const page = await browser.newPage();
  await page.goto("https://www.example.com/?s=" + term);
  await page.waitForSelector("div");
  const postLinks = await page.evaluate(() => {
    let postLinks = [];
    let elements = document.querySelectorAll('div.article');
    for (element of elements) {
      postLinks.push({
        title: element.querySelector('div.meta-info > h3 > a')?.textContent,
        url: element.querySelector('div.meta-info > h3 > a')?.href
      })
    }
    return postLinks;
  });
  console.log(postLinks);
  await browser.close();
}
app.get('/', (req, res) => {
  var term = "Drake";
  var results = farm(term);
  res.send(results);
});
Did I miss something?
Thanks!
It's not an infinite loop but an unresolved promise. farm returns a promise, which you're not waiting for; instead you send the pending promise before it resolves, i.e. before Puppeteer is done.
You need to wait for farm's promise to resolve: make the middleware function async and add await to the farm call. (Also make sure farm actually returns postLinks; as written it only logs them, so the awaited result would still be undefined.)
app.get('/', async (req, res) => {
  var term = "Drake";
  // farm returns a promise, so you need to wait for it to resolve, i.e. block execution;
  // otherwise it just sends a pending promise, because Node.js runs in a non-blocking fashion
  var results = await farm(term);
  res.send(results);
});
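One caveat worth hedging: Express 4 does not catch rejections from async middleware on its own, so if farm can fail (e.g. Puppeteer cannot launch), a try..catch around the await keeps the request from hanging. A minimal sketch:
app.get('/', async (req, res) => {
  try {
    var results = await farm("Drake");
    res.send(results);
  } catch (err) {
    // surface the failure instead of leaving the client waiting forever
    res.status(500).send(err.message);
  }
});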

How to get the download stream (buffer) using puppeteer?

I want to get the downloaded content (a buffer) and then store the data in my S3 account. So far I haven't been able to find a solution; looking for examples on the web, I noticed there are a lot of people with this problem. I tried (unsuccessfully) to use the page.on("response") event to retrieve the raw response content, according to the following snippet:
const bucket = [];
page.on("response", async response => {
  const url = response.url();
  if (
    url ===
    "https://the.earth.li/~sgtatham/putty/0.71/w32/putty-0.71-installer.msi"
  ) {
    try {
      if (response.status() === 200) {
        bucket.push(await response.buffer());
        console.log(bucket);
        // I got the following: 'Protocol error (Network.getResponseBody): No resource with given identifier found'
      }
    } catch (err) {
      console.error(err, "ERROR");
    }
  }
});
With the code above, I intended to detect the download dialog event and then, in some way, receive the binary content.
I'm not sure if that's the correct approach. I noticed that some people use a solution based on reading files; in other words, after the download finishes, they read the stored file from disk. There is a similar discussion at: https://github.com/GoogleChrome/puppeteer/issues/299.
My question is: Is there some way (using puppeteer), to intercept the download stream without having to save the file to disk before?
Thank you very much.
The problem is that the buffer is cleared as soon as any kind of navigation request happens. This might be a redirect or a page reload in your case.
To solve this problem, you need to make sure that the page does not make any navigation requests as long as you have not finished downloading your resource. To do this we can use page.setRequestInterception.
There is a simple solution, which might get you started but might not always work, and a more complex solution to this problem.
Simple solution
This solution cancels any navigation requests after the initial request. This means, any reload or navigation on the page will not work. Therefore the buffers of the resources are not cleared.
const browser = await puppeteer.launch();
const [page] = await browser.pages();

let initialRequest = true;
await page.setRequestInterception(true);
page.on('request', request => {
  // cancel any navigation requests after the initial page.goto
  if (request.isNavigationRequest() && !initialRequest) {
    return request.abort();
  }
  initialRequest = false;
  request.continue();
});
page.on('response', async (response) => {
  if (response.url() === 'RESOURCE YOU WANT TO DOWNLOAD') {
    const buffer = await response.buffer();
    // handle buffer
  }
});
await page.goto('...');
Advanced solution
The following code will process each request one after another. In case you download the buffer it will wait until the buffer is downloaded before processing the next request.
const browser = await puppeteer.launch();
const [page] = await browser.pages();

let paused = false;
let pausedRequests = [];

const nextRequest = () => { // continue the next request or "unpause"
  if (pausedRequests.length === 0) {
    paused = false;
  } else {
    // continue first request in "queue"
    (pausedRequests.shift())(); // calls the request.continue function
  }
};

await page.setRequestInterception(true);
page.on('request', request => {
  if (paused) {
    pausedRequests.push(() => request.continue());
  } else {
    paused = true; // pause, as we are processing a request now
    request.continue();
  }
});
page.on('requestfinished', async (request) => {
  const response = await request.response();
  if (response.url() === 'RESOURCE YOU WANT TO DOWNLOAD') {
    const buffer = await response.buffer();
    // handle buffer
  }
  nextRequest(); // continue with next request
});
page.on('requestfailed', nextRequest);
await page.goto('...');
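Since the original goal was to store the download in S3, here is a minimal sketch of what the // handle buffer step could look like, assuming the aws-sdk v2 package and placeholder bucket/key names (not from the question):
const AWS = require('aws-sdk');

const s3 = new AWS.S3(); // credentials come from env vars or shared AWS config

async function handleBuffer(buffer) {
  // 'my-bucket' is a placeholder; the key reuses the installer name from the question
  return s3.upload({
    Bucket: 'my-bucket',
    Key: 'putty-0.71-installer.msi',
    Body: buffer,
  }).promise();
}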

Puppeteer when error go to previous function or call

I want to make a Puppeteer script so that if the page errors or hits a connection issue (for example the "Aw, Snap!" page in Chrome), the script will reload/refresh/navigate back to the original target URL. So the .catch() handler has to call some refresh/re-navigate function. For now I can only print a text message with console.log when I catch an error. Here's my code:
const puppeteer = require('puppeteer');
puppeteer.launch({headless: false}).then(async browser => {
  const page = await browser.newPage();
  page.on('error', async err => { console.log('on page.on'); });
  await page.goto('https://www.google.com').then(async () => {
    while (1) {
      await page.waitForSelector("img", {timeout: 7000})
        .then(async () => {
          await page.evaluate(() => {
            return document.querySelector('div.jsb input[name="btnI"]').value;
          }).then(abc => {
            console.log(abc);
          })
          .catch(err => console.log('input button not found!!'));
        })
        .catch(err => console.log('selector not found!!'));
    }
  })
  .catch(err => console.log(err));
});
So, what I want: when 'input button not found!!' fires, it means something happened to the Google page, either a connection issue or something else. I need to re-visit https://www.google.com whenever 'input button not found!!' is triggered. How do I do that? I don't want to write the navigation twice by manually replacing the text 'input button not found!!' with await page.goto('https://www.google.com').
What I want is a dynamic solution, like placing some function, etc.
Thank you.
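A minimal sketch of the kind of dynamic solution described above: a made-up withRetries helper that re-navigates and retries instead of duplicating the goto call in every .catch() (all names here are illustrative, not an existing API):
// Hypothetical helper: re-navigates to url and retries fn() up to maxRetries times
async function withRetries(page, url, fn, maxRetries = 3) {
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      await page.goto(url);
      return await fn(page);
    } catch (err) {
      console.log(`attempt ${attempt} failed: ${err.message}`);
    }
  }
  throw new Error(`all ${maxRetries} attempts failed for ${url}`);
}

// usage with the question's Google example:
// const value = await withRetries(page, 'https://www.google.com', async p => {
//   await p.waitForSelector('img', {timeout: 7000});
//   return p.evaluate(() => document.querySelector('div.jsb input[name="btnI"]').value);
// });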

How to catch an exception inside an event listener?

I use the Puppeteer library to open a URL and process all of the requests' responses. Sometimes inside the page.on('response') event listener I need to throw an error, like in the example below. But I'm unable to catch these exceptions in any way; I always get the unhandled promise rejection error. How can I handle these exceptions? I don't want to use process.on('unhandledRejection') because it doesn't solve my problem at all.
const puppeteer = require('puppeteer');

(async () => {
  try {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    page.on('response', (request) => {
      throw 'response error';
    });
    await page.goto('http://google.com/');
    browser.close();
  } catch (e) {}
})();
Although your page.on handler function is located inside a try..catch block, the function is executed asynchronously and therefore any error thrown inside it is not caught by the outer try..catch block.
You have to create another try..catch block inside your function.
Example
const puppeteer = require('puppeteer');

function handleError(err) {
  // ...
}

(async () => {
  try {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    page.on('response', (request) => {
      try { // inner try..catch block to catch errors inside this function
        // ...
        throw 'response error';
      } catch (err) {
        // Handle the error here, or pass it to another function:
        handleError(err);
      }
    });
    await page.goto('http://google.com/');
    browser.close();
  } catch (e) {}
})();
I would never send responses from inside event handlers, as you will most definitely run into the problem of Express trying to send multiple responses to one user, resulting in an error (unless you make the handler track whether it has been called before and suppress further responses, but that isn't pretty either). I would use a Promise.race condition instead.
This waits for the first of your promises to either reject or resolve. Even though your page.on listener can never resolve, that's fine, because page.goto should resolve, or also reject.
So this would look somewhat like this:
try {
  await Promise.race([
    new Promise((res, rej) => page.on('error', error => rej(error))),
    page.goto('http://google.com/')
  ]);
  // Do stuff if goto succeeds
  browser.close();
} catch (err) {
  // do stuff with your error
  browser.close();
}
All you need to do to avoid that error is to catch the result of the async function, which is a promise, and handle it somehow, even if just by piping it through console.error.
const puppeteer = require('puppeteer');

(async () => {
  try {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    page.on('response', (request) => {
      throw 'response error';
    });
    await page.goto('http://google.com/');
    browser.close();
  } catch (e) {}
})().catch(console.error);

async/await issues with Chrome remote interface

I'd like to test this piece of code and wait until it's done before asserting the results. I'm not sure where the issue is; it should return Promise.resolve() at the end, but 'end' is logged before the code finishes executing.
Should Page.loadEventFired also be preceded by await?
const CDP = require('chrome-remote-interface')

async function x () {
  const protocol = await CDP()
  const timeout = ms => new Promise(resolve => setTimeout(resolve, ms))
  // See API docs: https://chromedevtools.github.io/devtools-protocol/
  const { Page, Runtime, DOM } = protocol
  await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()])
  Page.navigate({ url: 'http://example.com' })
  // wait until the page says it's loaded...
  return Page.loadEventFired(async () => {
    console.log('Page loaded! Now waiting a few seconds for all the JS to load...')
    await timeout(3000) // give the JS some time to load
    protocol.close()
    console.log('Processing page source...')
    console.log('Doing some fancy stuff here ...')
    console.log('All done.')
    return Promise.resolve()
  })
}

(async function () {
  console.log('start')
  await x()
  console.log('end')
})()
Yes, you should await Page.loadEventFired. Example:
async function x () {
  const protocol = await CDP()
  const timeout = ms => new Promise(resolve => setTimeout(resolve, ms))
  // See API docs: https://chromedevtools.github.io/devtools-protocol/
  const { Page, Runtime, DOM } = protocol
  await Promise.all([Page.enable(), Runtime.enable(), DOM.enable()])
  await Page.navigate({ url: 'http://example.com' })
  // wait until the page says it's loaded...
  await Page.loadEventFired()
  console.log('Page loaded! Now waiting a few seconds for all the JS to load...')
  await timeout(3000) // give the JS some time to load
  protocol.close()
  console.log('Processing page source...')
  console.log('Doing some fancy stuff here ...')
  console.log('All done.')
}
By the way, you might also want to wrap your code in try..finally to always close the protocol:
async function x () {
  let protocol
  try {
    protocol = await CDP()
    ...
  } finally {
    if (protocol) protocol.close()
  }
}
