How to add data via post in a request in Puppeteer? - node.js

I am trying to scrape a webpage with Puppeteer. Enter, navigate through some pages and in the data pages (those that are paginated) add POST data (emulating the form).
The event to intercept the request can only be created once, so all calls will be affected by the data sent via POST. (Node Puppeteer, page.on( "request" ) throw a "Request is already handled!")
I didn't find much information on this (how do POST request in puppeteer?), and finally did the following:
Create a function that will always be called (on each request).
Query an attribute of the function to see if it has an object.
If you have it, embed the data via POST; and remove the attribute.
If the attribute does not exist, continue without embedding data.
const openConnection = async () => {
const browser = await puppeteer.launch({
headless: true,
args: ["--no-sandbox"],
});
const page = await browser.newPage();
await page.setRequestInterception(true);
page.on("request", requestPost);
return { browser, page };
};
const requestPost = async (req) => {
if (typeof requestPost.data === "object") {
requestPost.data.headers = { ...req.headers(), ...requestPost.data.headers };
await req.continue(requestPost.data);
delete requestPost.data;
} else {
await req.continue();
}
};
const getData = async (m, y, p, l) => {
const { browser, page } = await openConnection();
let data = [];
let pagina = p;
do {
/* JUST because this attribute is being created, the next request that is created in the page.goto() that follows, will be altered with these attributes */
requestPost.data = {
method: "POST",
postData: `&pagina=${pagina}&mes=${m}&year=${y}`,
headers: { "Content-Type": "application/x-www-form-urlencoded" },
};
await page.goto("https://url.com/info.cgi", { waitUntil: "networkidle2" });
// Now I work the data and add it to the end
// data = data.push();
pagina++;
} while (pagina < p + l);
await closeConnection(page, browser);
return data;
};

Related

fs.writeFile crashes node app after writing first json file

I'm trying to crawl several web pages to check broken links and writing the results of the links to a json files, however, after the first file is completed the app crashes with no error popping up...
I'm using Puppeteer to crawl, Bluebird to run each link concurrently and fs to write the files.
WHAT IVE TRIED:
switching file type to '.txt' or '.php', this works but I need to create another loop outside the current workflow to convert the files from '.txt' to '.json'. Renaming the file right after writing to it also causes the app to crash.
using try catch statements for fs.writeFile but it never throws an error
the entire app outside of express, this worked at some point but i trying to use it within the framework
const express = require('express');
const router = express.Router();
const puppeteer = require('puppeteer');
const bluebird = require("bluebird");
const fs = require('fs');
router.get('/', function(req, res, next) {
(async () => {
// Our (multiple) URLs.
const urls = ['https://www.testing.com/allergy-test/', 'https://www.testing.com/genetic-testing/'];
const withBrowser = async (fn) => {
const browser = await puppeteer.launch();
try {
return await fn(browser);
} finally {
await browser.close();
}
}
const withPage = (browser) => async (fn) => {
const page = await browser.newPage();
// Turns request interceptor on.
await page.setRequestInterception(true);
// Ignore all the asset requests, just get the document.
page.on('request', request => {
if (request.resourceType() === 'document' ) {
request.continue();
} else {
request.abort();
}
});
try {
return await fn(page);
} finally {
await page.close();
}
}
const results = await withBrowser(async (browser) => {
return bluebird.map(urls, async (url) => {
return withPage(browser)(async (page) => {
await page.goto(url, {
waitUntil: 'domcontentloaded',
timeout: 0 // Removes timeout.
});
// Search for urls we want to "crawl".
const hrefs = await page.$$eval('a[href^="https://www.testing.com/"]', as => as.map(a => a.href));
// Predefine our arrays.
let links = [];
let redirect = [];
// Loops through each /goto/ url on page
for (const href of Object.entries(hrefs)) {
response = await page.goto(href[1], {
waitUntil: 'domcontentloaded',
timeout: 0 // Remove timeout.
});
const chain = response.request().redirectChain();
const link = {
'source_url': href[1],
'status': response.status(),
'final_url': response.url(),
'redirect_count': chain.length,
};
// Loops through the redirect chain for each href.
for ( const ch of chain) {
redirect = {
status: ch.response().status(),
url: ch.url(),
};
}
// Push all info of target link into links
links.push(link);
}
// JSONify the data.
const linksJson = JSON.stringify(links);
fileName = url.replace('https://www.testing.com/', '');
fileName = fileName.replace(/[^a-zA-Z0-9\-]/g, '');
// Write data to file in /tmp directory.
fs.writeFile(`./tmp/${fileName}.json`, linksJson, (err) => {
if (err) {
return console.log(err);
}
});
});
}, {concurrency: 4}); // How many pages to run at a time.
});
})();
});
module.exports = router;
UPDATE:
So there is nothing wrong with my code... I realized nodemon was stopping the process after each file was saved. Since nodemon would detect a "file change" it kept restarting my server after the first item

Why am I not able to navigate through iFrames using Apify/Puppeteer?

I'm trying to manipulate forms of sites w/ iFrames in it using Puppeteer. I tried different ways to reach a specific iFrame, or even to count iFrames in a website, with no success.
Why isn't Puppeteer's object recognizing the iFrames / child frames of the page I'm trying to navigate through?
It's happening with other pages as well, such as https://www.veiculos.itau.com.br/simulacao
const Apify = require('apify');
const sleep = require('sleep-promise');
Apify.main(async () => {
// Launch the web browser.
const browser = await Apify.launchPuppeteer();
// Create and navigate new page
console.log('Open target page');
const page = await browser.newPage();
await page.goto('https://www.credlineitau.com.br/');
await sleep(15 * 1000);
for (const frame in page.mainFrame().childFrames()) {
console.log('test');
}
await browser.close();
});
Perhaps you'll find some helpful inspiration below.
const waitForIframeContent = async (page, frameSelector, contentSelector) => {
await page.waitForFunction((frameSelector, contentSelector) => {
const frame = document.querySelector(frameSelector);
const node = frame.contentDocument.querySelector(contentSelector);
return node && node.innerText;
}, {
timeout: TIMEOUTS.ten,
}, frameSelector, contentSelector);
};
const $frame = await waitForSelector(page, SELECTORS.frame.iframeNode).catch(() => null);
if ($frame) {
const frame = page.frames().find(frame => frame.name() === 'content-iframe');
const $cancelStatus = await waitForSelector(frame, SELECTORS.frame.membership.cancelStatus).catch(() => null);
await waitForIframeContent(page, SELECTORS.frame.iframeNode, SELECTORS.frame.membership.cancelStatus);
}
Give it a shot.

Click anywhere on page using Puppeteer

Currently I'm using Puppeteer to fetch cookies & headers from a page, however it's using a bot prevention system which is only bypassed when clicking on the page; I don't want to keep this sequential so it's "detectable"
How can I have my Puppeteer click anywhere on the page at random? regardless of wether it clicks a link, button etc..
I've currently got this code
const getCookies = async (state) => {
try {
state.browser = await launch_browser(state);
state.context = await state.browser.createIncognitoBrowserContext();
state.page = await state.context.newPage();
await state.page.authenticate({
username: proxies.username(),
password: proxies.password(),
});
await state.page.setViewport(functions.get_viewport());
state.page.on('response', response => handle_response(response, state));
await state.page.goto('https://www.website.com', {
waitUntil: 'networkidle0',
});
await state.page.waitFor('.unlockLink a', {
timeout: 5000
});
await state.page.click('.unlockLink a');
await state.page.waitFor('input[id="nondevice"]', {
timeout: 5000
});
state.publicIpv4Address = await state.page.evaluate(() => {
return sessionStorage.getItem("publicIpv4Address");
});
state.csrfToken = await state.page.evaluate(() => {
return sessionStorage.getItem("csrf-token");
});
//I NEED TO CLICK HERE! CAN BE WHITESPACE, LINK, IMAGE
state.browser_cookies = await state.page.cookies();
state.browser.close();
for (const cookie of state.browser_cookies) {
if(cookie.name === "dtPC") {
state.dtpc = cookie.value;
}
await state.jar.setCookie(
`${cookie.name}=${cookie.value}`,
'https://www.website.com'
)
}
return state;
} catch(error) {
if(state.browser) {
state.browser.close();
}
throw new Error(error);
}
};
The simplest way I can think of out of my head to choose a random element from DOM would be probably something like using querySelectorAll() which will return you an array of all <div>s in your document (or choose any other element, like <p> or anything else), then you can easily use click() on random one from the result, for example:
await page.evaluate(() => {
const allDivs = document.querySelectorAll('.left-sidebar-toggle');
const randomElement = allDivs[Math.floor(Math.random() * allDivs.length)];
randomElement.click();
});

Download files before build in gatsby wordpress

I have a client that im working with who needs his pdfs to be readable in browser and the user doesn't need to download them first and it turned out to not be an option to do it through Wordpress so I thought I can download them in gatsby before build everytime if they don't already exist and I was wondering if this is possible.
I found this repo: https://github.com/jamstack-cms/jamstack-ecommerce
that shows a way to do it with this code:
function getImageKey(url) {
const split = url.split('/')
const key = split[split.length - 1]
const keyItems = key.split('?')
const imageKey = keyItems[0]
return imageKey
}
function getPathName(url, pathName = 'downloads') {
let reqPath = path.join(__dirname, '..')
let key = getImageKey(url)
key = key.replace(/%/g, "")
const rawPath = `${reqPath}/public/${pathName}/${key}`
return rawPath
}
async function downloadImage (url) {
return new Promise(async (resolve, reject) => {
const path = getPathName(url)
const writer = fs.createWriteStream(path)
const response = await axios({
url,
method: 'GET',
responseType: 'stream'
})
response.data.pipe(writer)
writer.on('finish', resolve)
writer.on('error', reject)
})
}
but It doesn't seem to work if i put it in my createPages and i cant use it outside it either because i don't have access to graphql to query the data first.
any idea how to do this?
WordPress source example is defined as async:
exports.createPages = async ({ graphql, actions }) => {
... so you can already use await to download your file(-s) just after querying data (and before createQuery() call). It should (NOT TESTED) be as easy as:
// Check for any errors
if (result.errors) {
console.error(result.errors)
}
// Access query results via object destructuring
const { allWordpressPage, allWordpressPost } = result.data
const pageTemplate = path.resolve(`./src/templates/page.js`)
allWordpressPage.edges.forEach(edge => {
// for one file per edge
// url taken/constructed from some edge property
await downloadImage (url);
createPage({
Of course for multiple files you should use Promise.all to wait for [resolving] all [returned promise] downloads before creating page:
allWordpressPage.edges.forEach(edge => {
// for multiple files per edge(page)
// url taken/constructed from some edge properties in a loop
// adapth 'paths' of iterable (edge.xxx.yyy...)
// and/or downloadImage(image) argument, f.e. 'image.someUrl'
await Promise.all(
edge.node.someImageArrayNode.map( image => { return downloadImage(image); }
);
createPage({
If you need to pass/update image nodes (for components usage) you should be able to mutate nodes, f.e.:
await Promise.all(
edge.node.someImageArrayNode.map( image => {
image["fullUrl"] = `/publicPath/${image.url}`;
return downloadImage(image.url); // return Promise at the end
}
);
createPage({
path: slugify(item.name),
component: ItemView,
context: {
content: item,
title: item.name,
firstImageUrl: edge.node.someImageArrayNode[0].fullUrl,
images: edge.node.someImageArrayNode

Use headless chrome to intercept image request data

I have a use case that needs to use Headless Chrome Network (https://chromedevtools.github.io/devtools-protocol/tot/Network/) to intercept all images requests and find out the image size before saving it (basically discard small images such as icons).
However, I am unable to figure out a way to load the image data in memory before saving it. I need to load it in Img object to get width and height. The Network.getResponseBody is taking requestId which I don't have access in Network.requestIntercepted. Also Network.loadingFinished always gives me "0" in encodedDataLength variable. I have no idea why. So my questions are:
How to intercept all responses from jpg/png request and get the image data? Without saving the file via URL string to the disk and load back.
BEST: how to get image dimension from header response? Then I don't have to read the data into memory at all.
My code is below:
const chromeLauncher = require('chrome-launcher');
const CDP = require('chrome-remote-interface');
const file = require('fs');
(async function() {
async function launchChrome() {
return await chromeLauncher.launch({
chromeFlags: [
'--disable-gpu',
'--headless'
]
});
}
const chrome = await launchChrome();
const protocol = await CDP({
port: chrome.port
});
const {
DOM,
Network,
Page,
Emulation,
Runtime
} = protocol;
await Promise.all([Network.enable(), Page.enable(), Runtime.enable(), DOM.enable()]);
await Network.setRequestInterceptionEnabled({enabled: true});
Network.requestIntercepted(({interceptionId, request, resourceType}) => {
if ((request.url.indexOf('.jpg') >= 0) || (request.url.indexOf('.png') >= 0)) {
console.log(JSON.stringify(request));
console.log(resourceType);
if (request.url.indexOf("/unspecified.jpg") >= 0) {
console.log("FOUND unspecified.jpg");
console.log(JSON.stringify(interceptionId));
// console.log(JSON.stringify(Network.getResponseBody(interceptionId)));
}
}
Network.continueInterceptedRequest({interceptionId});
});
Network.loadingFinished(({requestId, timestamp, encodedDataLength}) => {
console.log(requestId);
console.log(timestamp);
console.log(encodedDataLength);
});
Page.navigate({
url: 'https://www.yahoo.com/'
});
Page.loadEventFired(async() => {
protocol.close();
chrome.kill();
});
})();
This should get you 90% of the way there. It gets the body of each image request. You'd still need to base64decode, check size and save etc...
const CDP = require('chrome-remote-interface');
const sizeThreshold = 1024;
async function run() {
try {
var client = await CDP();
const { Network, Page } = client;
// enable events
await Promise.all([Network.enable(), Page.enable()]);
// commands
const _url = "https://google.co.za";
let _pics = [];
Network.responseReceived(async ({requestId, response}) => {
let url = response ? response.url : null;
if ((url.indexOf('.jpg') >= 0) || (url.indexOf('.png') >= 0)) {
const {body, base64Encoded} = await Network.getResponseBody({ requestId }); // throws promise error returning null/undefined so can't destructure. Must be different in inspect shell to app?
_pics.push({ url, body, base64Encoded });
console.log(url, body, base64Encoded);
}
});
await Page.navigate({ url: _url });
await sleep(5000);
// TODO: process _pics - base64Encoded, check body.length > sizeThreshold, save etc...
} catch (err) {
if (err.message && err.message === "No inspectable targets") {
console.error("Either chrome isn't running or you already have another app connected to chrome - e.g. `chrome-remote-interface inspect`")
} else {
console.error(err);
}
} finally {
if (client) {
await client.close();
}
}
}
function sleep(miliseconds = 1000) {
if (miliseconds == 0)
return Promise.resolve();
return new Promise(resolve => setTimeout(() => resolve(), miliseconds))
}
run();

Resources