Scrape information with form submit using Phantom - node.js

I want to do web scraping of this site.
I have seen that the APIs are available but, as duraid suggested in my previous question, it is not advisable to use them.
So I tried to use Node.js and PhantomJS via the phantom module.
This is my code:
var phantom = require('phantom');
// object of methods
var methods = {};
var loadInProgress = false;
var url = 'http://data.un.org/Data.aspx?q=population&d=PopDiv&f=variableID%3A12';
methods.download = async function(req, res) {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onResourceRequested', function(requestData) {
console.info('Requesting', requestData.url);
});
await page.on('onConsoleMessage', function(msg) {
console.info(msg);
});
await page.on('onLoadStarted', function() {
loadInProgress = true;
console.log('Load started...');
});
await page.on('onLoadFinished', function() {
loadInProgress = false;
console.log('Load end');
});
const status = await page.open(url);
console.log('STATUS:', status);
const content = await page.property('content');
console.log('CONTENT:', content);
// submit
await page.evaluate(function() {
document.getElementById('crID%3a250').value = 'crID%3a250'; // France
document.getElementById('timeID%3a79').value = 'timeID%3a79'; // 2015
document.getElementById('varID%3a2').value = 'varID%3a2'; // Medium
document.getElementById('ctl00_main_filters_anchorApplyBottom').submit(); // submit button
});
var result = await page.evaluate(function() {
return document.querySelectorAll('html')[0].outerHTML;
});
console.log('RESULT:', result);
await instance.exit();
};
module.exports = methods;
(How can I select more countries and more years?)
I tried to select France as Country or Area, 2015 as Year and Medium as Variant.
So crID%3a250 is the id of this element:
<input type="checkbox" id="crID%3a250" value="crID%3a250" name="France" />
<label for="crID%3a250">France</label><br />
timeID%3a79 is the id of this element:
<input type="checkbox" id="timeID%3a79" value="timeID%3a79" name="2015" />
<label for="timeID%3a79">2015</label><br />
varID%3a2 is the id of this element:
<input type="checkbox" id="varID%3a2" value="varID%3a2" name="Medium" />
<label for="varID%3a2">Medium</label><br />
And ctl00_main_filters_anchorApplyBottom is the id of the button element:
<div class="All">
<img src="_Images/IconUpdateResults.png" alt="Update" width="11px" height="11px" title="Apply filters" /> Apply Filters
</div>
But what I get is the web page itself (in HTML), not the data that interests me.
So it's as if I had not selected any parameters. Why?
EDIT 1
Following @Vaviloff's advice I tried to change the code, but without success.
My server-side language is Node.js.
Using Phantom I modified the code like this:
methods.download = async function(req, res) {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onResourceRequested', function(requestData) {
console.log('Requesting', requestData.url);
});
await page.on('onConsoleMessage', function(msg) {
console.log(msg);
});
const status = await page.open(url);
console.log('\n\nSTATUS:', status);
// submit
await page.evaluate(function() {
var countries = {
'Albania': 'crID%3a8',
'Austria': 'crID%3a40',
'Belgium': 'crID%3a56',
'Bulgaria': 'crID%3a100',
'Croatia': 'crID%3a191',
'Cyprus': 'crID%3a196',
'Denmark': 'crID%3a208',
'Estonia': 'crID%3a233',
'Finland': 'crID%3a246',
'France': 'crID%3a250',
'Germany': 'crID%3a276',
'Greece': 'crID%3a300',
'Iceland': 'crID%3a352',
'Ireland': 'crID%3a372',
'Italy': 'crID%3a380',
'Latvia': 'crID%3a428',
'Netherlands': 'crID%3a528',
'Norway': 'crID%3a578',
'Poland': 'crID%3a616',
'Portugal': 'crID%3a620',
'Romania': 'crID%3a642',
'Slovakia': 'crID%3a703',
'Slovenia': 'crID%3a705',
'Spain': 'crID%3a724',
'Sweden': 'crID%3a752',
'Switzerland': 'crID%3a756',
'United Kingdom': 'crID%3a826'
};
// 2018 - 1980
var years = ['timeID%3a83', 'timeID%3a82', 'timeID%3a81', 'timeID%3a79', 'timeID%3a78', 'timeID%3a77', 'timeID%3a76', 'timeID%3a75', 'timeID%3a73', 'timeID%3a72', 'timeID%3a71', 'timeID%3a70', 'timeID%3a69', 'timeID%3a67', 'timeID%3a66', 'timeID%3a65', 'timeID%3a64', 'timeID%3a63', 'timeID%3a61', 'timeID%3a60', 'timeID%3a59', 'timeID%3a58', 'timeID%3a57', 'timeID%3a55', 'timeID%3a54', 'timeID%3a53', 'timeID%3a52', 'timeID%3a51', 'timeID%3a49', 'timeID%3a48', 'timeID%3a47', 'timeID%3a46', 'timeID%3a45', 'timeID%3a43', 'timeID%3a42', 'timeID%3a41', 'timeID%3a40', 'timeID%3a39', 'timeID%3a37'];
// select countries
for(var c in countries) {
document.getElementById(countries[c]).setAttribute('checked', true);
}
// select years
for(var y in years) {
document.getElementById(years[y]).setAttribute('checked', true);
}
// select variants
document.getElementById('varID%3a2').setAttribute('checked', true); // medium
// click button
document.getElementById('ctl00_main_filters_anchorApplyBottom').click();
});
console.log('\nWaiting 1.5 seconds...');
await timeout(1500);
// get only the table contents
var result = await page.evaluate(function() {
return document.querySelectorAll('.DataContainer table')[0].outerHTML;
});
console.log('\n\nRESULT:', result);
elaborateResult(result);
await instance.exit();
};
function elaborateResult(res) {
  var el = document.createElement('html'); // ** ERROR HERE **
  el.innerHTML = res;
  console.log('\n\nTD ELEMENTS:', el.getElementsByTagName('td'));
  //var obj = utilFunc.createJsonObjectPop(year, country, population);
  //console.log(obj);
}
There are two problems:
1. result contains only the values from the first page of the results, but with the selections made there are 22 pages of results, and I don't understand how to get all the values that interest me and collect them in the result variable.
2. Assuming problem (1) is solved, I then need to process the results and create an object like this:
var date = [{year: 2018, country: 'Albania', population: 2934.363}, {year: 2017, country: 'Albania', population: 2930.187}, ..., {year: 1980, country: 'United Kingdom', population: 56265.475}]
This is what the elaborateResult(res) function should do (of course the function is not complete, I still have to finish it), but at its very first line I get the error:
ReferenceError: document is not defined
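(document only exists inside the browser context, so outside page.evaluate the returned HTML string would have to be parsed with a library such as cheerio instead. A minimal sketch of elaborateResult along those lines; the column indices are only a guess at the table layout:)
var cheerio = require('cheerio');

function elaborateResult(res) {
  var $ = cheerio.load(res);
  var data = [];
  $('tr').each(function() {
    var cells = $(this).find('td');
    if (cells.length === 0) return; // skip header rows
    data.push({
      // indices below are assumptions - check the real table before relying on them
      country: $(cells[0]).text().trim(),
      year: parseInt($(cells[1]).text(), 10),
      population: parseFloat($(cells[4]).text())
    });
  });
  console.log(data);
  return data;
}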
So I changed my strategy: instead of Phantom I tried a plain request:
// assuming request-promise here, since request(options) is used as a promise below
var request = require('request-promise');
var cheerio = require('cheerio');

var options = {
  uri: 'http://data.un.org/Handlers/DataHandler.ashx?Service=query&Anchor=variableID%3a12&Applied=crID%3a8&crID%3a40;timeID%3a79&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=302',
  transform: function(body) {
    return cheerio.load(body);
  }
};

methods.download = async function(req, res) {
  request(options)
    .then(function($) {
      console.log('\n\nTHEN: ', $);
    })
    .catch(function(err) {
      console.log('Error', err.stack);
    });
};
If I run this code I get:
THEN: function (selector, context, r, opts) {
if (!(this instanceof initialize)) {
return new initialize(selector, context, r, opts);
}
opts = _.defaults(opts || {}, options);
return Cheerio.call(this, selector, context, r || root, opts);
}
In this case I have other problems.
I don't know how to build the URL.
In the example above I chose Albania (crID%3a8) and Austria (crID%3a40), and 2015 as the year (timeID%3a79).
Yet if I open the link built this way, I get data for Albania from 2100 to 2095.
I don't know how to select the years or how to select variants or how to change pages.
I feel a bit stupid but I can't get what I want... I'm stuck.
Help would be very welcome!

There are several issues with your script that prevent a successful scrape.
To check a checkbox, you don't set its value again (it's already set in HTML!), you set its checked attribute to true:
document.getElementById('crID%3a250').setAttribute("checked", true); // France
The button that submits the form is a hyperlink <a>, which doesn't have a submit method; it should be clicked instead (it even has an onClick handler in the page code):
document.getElementById('ctl00_main_filters_anchorApplyBottom').click(); // submit the form
The search request is sent through ajax and takes time to complete, so your script should wait for at least a second before trying to fetch the data. I'll show how to wait in the full working code below.
Next, you can fetch only the table data; there is no need to sift through all the HTML:
var result = await page.evaluate(function() {
return document.querySelectorAll('.DataContainer table')[0].outerHTML;
});
Here's a slightly trimmed-down version of your script with the issues corrected:
var phantom = require('phantom');
var url = 'http://data.un.org/Data.aspx?q=population&d=PopDiv&f=variableID%3A12';
// A promise to wait for n of milliseconds
const timeout = ms => new Promise(resolve => setTimeout(resolve, ms));
(async function(req, res) {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onResourceRequested', function(requestData) {
console.info('Requesting', requestData.url);
});
await page.on('onConsoleMessage', function(msg) {
console.info(msg);
});
const status = await page.open(url);
await console.log('STATUS:', status);
// submit
await page.evaluate(function() {
document.getElementById('crID%3a250').setAttribute("checked", true); // France
document.getElementById('timeID%3a79').setAttribute("checked", true); // 2015
document.getElementById('varID%3a2').setAttribute("checked", true); // Medium
document.getElementById('ctl00_main_filters_anchorApplyBottom').click(); // click submit button
});
console.log('Waiting 1.5 seconds..');
await timeout(1500);
// Get only the table contents
var result = await page.evaluate(function() {
return document.querySelectorAll('.DataContainer table')[0].outerHTML;
});
await console.log('RESULT:', result);
await instance.exit();
})();
Last but not least: you could simply replay the ajax request made by the form and find that the URL of the search request works quite well on its own, when just opened in another tab:
You don't even need a headless browser to get it; just curl or request the URL and process the response. This happens with sites a lot, so it's useful to check the Network tab in your browser's devtools before scraping.
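For example, with request-promise and cheerio (a minimal sketch; the handler URL is the one from the question and the selector mirrors the one used on the full page, so both may need adjusting):
var request = require('request-promise');
var cheerio = require('cheerio');

// Sketch: fetch the search handler directly and dump the table rows.
var handlerUrl = 'http://data.un.org/Handlers/DataHandler.ashx?Service=query&Anchor=variableID%3a12&Applied=crID%3a8&crID%3a40;timeID%3a79&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=302';

request(handlerUrl).then(function(body) {
  var $ = cheerio.load(body);
  $('.DataContainer table tr').each(function() {
    console.log($(this).text().trim());
  });
});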
Update
And if there are so many results that they are spread over several pages, there is one more parameter to use in the request: Page:
data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=variableID:12&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=461
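A rough sketch of walking through all the result pages with that parameter (again assuming request-promise and cheerio; the loop simply stops when a page comes back with no data rows, and RequestId is left out, so it may need to be added back):
var request = require('request-promise');
var cheerio = require('cheerio');

async function fetchAllPages() {
  var allRows = [];
  for (var pageNo = 1; ; pageNo++) {
    var url = 'http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=' + pageNo +
      '&DataFilter=variableID:12&DataMartId=PopDiv&UserQuery=population' +
      '&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc';
    var $ = cheerio.load(await request(url));
    var rows = $('table tr').toArray().map(function(tr) { return $(tr).text().trim(); });
    if (rows.length === 0) break; // past the last page
    allRows = allRows.concat(rows);
  }
  return allRows;
}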

Related

Problem with picking HTML element with cheerio.js [duplicate]

I am trying to scrape a website but I don't get some of the elements, because these elements are dynamically created.
I use cheerio in Node.js and my code is below.
var request = require('request');
var cheerio = require('cheerio');
var url = "http://www.bdtong.co.kr/index.php?c_category=C02";
request(url, function (err, res, html) {
var $ = cheerio.load(html);
$('.listMain > li').each(function () {
console.log($(this).find('a').attr('href'));
});
});
This code returns an empty response because, when the page is loaded, the <ul id="store_list" class="listMain"> element is still empty.
The content has not been appended yet.
How can I get these elements using node.js? How can I scrape pages with dynamic content?
Here you go:
var phantom = require('phantom');
phantom.create(function (ph) {
ph.createPage(function (page) {
var url = "http://www.bdtong.co.kr/index.php?c_category=C02";
page.open(url, function() {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
page.evaluate(function() {
$('.listMain > li').each(function () {
console.log($(this).find('a').attr('href'));
});
}, function(){
ph.exit()
});
});
});
});
});
Check out GoogleChrome/puppeteer
Headless Chrome Node API
It makes scraping pretty trivial. The following example will scrape the headline over at npmjs.com (assuming .npm-expansions remains)
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://www.npmjs.com/');
const textContent = await page.evaluate(() => {
return document.querySelector('.npm-expansions').textContent
});
console.log(textContent); /* No Problem Mate */
browser.close();
})();
evaluate allows inspection of the dynamic element, since it runs scripts in the context of the page.
Use the new npm module x-ray, with a pluggable web driver x-ray-phantom.
Examples in the pages above, but here's how to do dynamic scraping:
var phantom = require('x-ray-phantom');
var Xray = require('x-ray');
var x = Xray()
.driver(phantom());
x('http://google.com', 'title')(function(err, str) {
if (err) return done(err);
assert.equal('Google', str);
done();
})
Answering this as a canonical, an alternative to Puppeteer for scraping dynamic sites which is also well-supported as of 2023 is Playwright. Here's a simple example:
const playwright = require("playwright"); // ^1.28.1
let browser;
(async () => {
browser = await playwright.chromium.launch();
const page = await browser.newPage();
await page.goto("https://example.com");
const text = await page.locator('h1:text("Example")').textContent();
console.log(text); // => Example Domain
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
The easiest and most reliable solution is to use Puppeteer, as described in https://pusher.com/tutorials/web-scraper-node, which is suitable for both static and dynamic scraping.
Just change the timeout from 300000 to 3000000 in Browser.js, TimeoutSettings.js and Launcher.js.
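Rather than editing Puppeteer's internal files, the same timeouts can usually be raised through the public API (a sketch; the 3000000 ms value just mirrors the suggestion above):
const puppeteer = require('puppeteer');

(async () => {
  // launch timeout instead of patching Launcher.js
  const browser = await puppeteer.launch({ timeout: 3000000 });
  const page = await browser.newPage();
  // default navigation timeout instead of patching TimeoutSettings.js
  page.setDefaultNavigationTimeout(3000000);
  // per-call timeout instead of a global change
  await page.goto('https://example.com', { timeout: 3000000 });
  await browser.close();
})();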

Click anywhere on page using Puppeteer

Currently I'm using Puppeteer to fetch cookies and headers from a page; however, the site uses a bot-prevention system that is only bypassed by clicking on the page, and I don't want to keep the clicking sequential so that it becomes "detectable".
How can I have Puppeteer click anywhere on the page at random, regardless of whether it clicks a link, a button, etc.?
I've currently got this code:
const getCookies = async (state) => {
try {
state.browser = await launch_browser(state);
state.context = await state.browser.createIncognitoBrowserContext();
state.page = await state.context.newPage();
await state.page.authenticate({
username: proxies.username(),
password: proxies.password(),
});
await state.page.setViewport(functions.get_viewport());
state.page.on('response', response => handle_response(response, state));
await state.page.goto('https://www.website.com', {
waitUntil: 'networkidle0',
});
await state.page.waitFor('.unlockLink a', {
timeout: 5000
});
await state.page.click('.unlockLink a');
await state.page.waitFor('input[id="nondevice"]', {
timeout: 5000
});
state.publicIpv4Address = await state.page.evaluate(() => {
return sessionStorage.getItem("publicIpv4Address");
});
state.csrfToken = await state.page.evaluate(() => {
return sessionStorage.getItem("csrf-token");
});
//I NEED TO CLICK HERE! CAN BE WHITESPACE, LINK, IMAGE
state.browser_cookies = await state.page.cookies();
state.browser.close();
for (const cookie of state.browser_cookies) {
if(cookie.name === "dtPC") {
state.dtpc = cookie.value;
}
await state.jar.setCookie(
`${cookie.name}=${cookie.value}`,
'https://www.website.com'
)
}
return state;
} catch(error) {
if(state.browser) {
state.browser.close();
}
throw new Error(error);
}
};
The simplest way I can think of off the top of my head to choose a random element from the DOM would be to use querySelectorAll(), which returns a list of all matching elements in your document (all <div>s, or any other element, like <p>), and then call click() on a random one from the result, for example:
await page.evaluate(() => {
const allDivs = document.querySelectorAll('.left-sidebar-toggle');
const randomElement = allDivs[Math.floor(Math.random() * allDivs.length)];
randomElement.click();
});
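If the click really can land anywhere and not just on an existing element, another option is Puppeteer's mouse API with random coordinates inside the viewport (a sketch, reusing state.page from the question):
// Sketch: click a random point inside the current viewport.
const viewport = state.page.viewport(); // the size set earlier via setViewport
const x = Math.floor(Math.random() * viewport.width);
const y = Math.floor(Math.random() * viewport.height);
await state.page.mouse.click(x, y);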

Scrape part of page that is not html

I want to scrape this site.
I'm using Node.js and PhantomJS via the phantom module.
This is my code:
var phantom = require('phantom');
var loadInProgress = false;
var url = 'http://apps.who.int/flumart/Default?ReportNo=12';
(async function() {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onResourceRequested', function(requestData) {
console.info('Requesting', requestData.url);
});
await page.on('onConsoleMessage', function(msg) {
console.info(msg);
});
await page.on('onLoadStarted', function() {
loadInProgress = true;
console.log('Load started...');
});
await page.on('onLoadFinished', function() {
loadInProgress = false;
console.log('Load end');
});
const status = await page.open(url);
await console.log('STATUS:', status);
const content = await page.property('content');
await console.log('CONTENT:', content);
// submit
await page.evaluate(function() {
document.getElementById('lblFilteBy').value = 'Country, area or territory'; //'WHO region';
document.getElementById('lblSelectBy').value = 'Italy'; //'European Region of WHO';
document.getElementById('lbl_YearFrom').value = '1995';
document.getElementById('lbl_WeekFrom').value = '1';
document.getElementById('lbl_YearTo').value = '2018';
document.getElementById('ctl_list_WeekTo').value = '53';
//console.log('SUBMIT:', document.getElementById('ctl_ViewReport'));
document.getElementById('ctl_ViewReport').submit();
});
var result = await page.evaluate(function() {
return document.querySelectorAll('html')[0].outerHTML; // Problem here
});
await console.log('RESULT:', result);
await instance.exit();
}());
I don't understand what this part of the page (in red) is:
It's not HTML; how do I scrape the displayed data?
Thanks!
EDIT 1
If I go to the 'Network' tab of Chrome dev tools:
You can catch the ajax request, check:
Outlined in blue is the XHR request that you need to call yourself in your Phantom script, and the ajax result is outlined in red. In the Headers tab, you will see the form data sent via POST to the page.
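A minimal sketch of replaying that POST outside the browser (assuming request-promise and cheerio; the URI below is just the page address from the question and the form object is a placeholder, so copy the real endpoint and field names from the intercepted request in the Network tab):
var request = require('request-promise');
var cheerio = require('cheerio');

request.post({
  uri: 'http://apps.who.int/flumart/Default?ReportNo=12', // replace with the real XHR endpoint
  form: {
    // ...form data copied from the Headers tab of the intercepted request...
  },
  transform: function(body) {
    return cheerio.load(body);
  }
}).then(function($) {
  $('table tr').each(function() {
    console.log($(this).text().trim());
  });
});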
This is going to be hard. Take a look at this: Node.js web browser with JavaScript execution
Basically, you need a lib that simulates a browser with JS execution, use it to render the report, and then parse the result.

Use headless chrome to intercept image request data

I have a use case that needs to use Headless Chrome Network (https://chromedevtools.github.io/devtools-protocol/tot/Network/) to intercept all images requests and find out the image size before saving it (basically discard small images such as icons).
However, I am unable to figure out a way to load the image data in memory before saving it. I need to load it into an Img object to get the width and height. Network.getResponseBody takes a requestId, which I don't have access to in Network.requestIntercepted. Also, Network.loadingFinished always gives me "0" in the encodedDataLength variable, and I have no idea why. So my questions are:
How do I intercept all responses to jpg/png requests and get the image data, without saving the file to disk via the URL string and loading it back?
Best of all: how can I get the image dimensions from the response headers? Then I wouldn't have to read the data into memory at all.
My code is below:
const chromeLauncher = require('chrome-launcher');
const CDP = require('chrome-remote-interface');
const file = require('fs');
(async function() {
async function launchChrome() {
return await chromeLauncher.launch({
chromeFlags: [
'--disable-gpu',
'--headless'
]
});
}
const chrome = await launchChrome();
const protocol = await CDP({
port: chrome.port
});
const {
DOM,
Network,
Page,
Emulation,
Runtime
} = protocol;
await Promise.all([Network.enable(), Page.enable(), Runtime.enable(), DOM.enable()]);
await Network.setRequestInterceptionEnabled({enabled: true});
Network.requestIntercepted(({interceptionId, request, resourceType}) => {
if ((request.url.indexOf('.jpg') >= 0) || (request.url.indexOf('.png') >= 0)) {
console.log(JSON.stringify(request));
console.log(resourceType);
if (request.url.indexOf("/unspecified.jpg") >= 0) {
console.log("FOUND unspecified.jpg");
console.log(JSON.stringify(interceptionId));
// console.log(JSON.stringify(Network.getResponseBody(interceptionId)));
}
}
Network.continueInterceptedRequest({interceptionId});
});
Network.loadingFinished(({requestId, timestamp, encodedDataLength}) => {
console.log(requestId);
console.log(timestamp);
console.log(encodedDataLength);
});
Page.navigate({
url: 'https://www.yahoo.com/'
});
Page.loadEventFired(async() => {
protocol.close();
chrome.kill();
});
})();
This should get you 90% of the way there. It gets the body of each image request. You'd still need to base64-decode it, check the size, save it, etc.
const CDP = require('chrome-remote-interface');
const sizeThreshold = 1024;
async function run() {
try {
var client = await CDP();
const { Network, Page } = client;
// enable events
await Promise.all([Network.enable(), Page.enable()]);
// commands
const _url = "https://google.co.za";
let _pics = [];
Network.responseReceived(async ({requestId, response}) => {
let url = response ? response.url : null;
if ((url.indexOf('.jpg') >= 0) || (url.indexOf('.png') >= 0)) {
const {body, base64Encoded} = await Network.getResponseBody({ requestId }); // throws promise error returning null/undefined so can't destructure. Must be different in inspect shell to app?
_pics.push({ url, body, base64Encoded });
console.log(url, body, base64Encoded);
}
});
await Page.navigate({ url: _url });
await sleep(5000);
// TODO: process _pics - base64Encoded, check body.length > sizeThreshold, save etc...
} catch (err) {
if (err.message && err.message === "No inspectable targets") {
console.error("Either chrome isn't running or you already have another app connected to chrome - e.g. `chrome-remote-interface inspect`")
} else {
console.error(err);
}
} finally {
if (client) {
await client.close();
}
}
}
function sleep(milliseconds = 1000) {
  if (milliseconds == 0)
    return Promise.resolve();
  return new Promise(resolve => setTimeout(() => resolve(), milliseconds));
}
run();
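To finish that remaining part without writing files to disk, the intercepted body can be turned into a Buffer and measured with something like the image-size package (an assumption; any decoder that accepts a Buffer would do, and the 100x100 threshold is just an example):
const sizeOf = require('image-size');

// Sketch: decode one intercepted response body and keep it only if it is
// larger than some minimum dimensions (icons and such get discarded).
function keepIfLargeEnough({ url, body, base64Encoded }) {
  const buffer = Buffer.from(body, base64Encoded ? 'base64' : 'utf8');
  const { width, height } = sizeOf(buffer); // reads dimensions from the buffer
  if (width >= 100 && height >= 100) {
    return { url, width, height, buffer };
  }
  return null; // too small, probably an icon
}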

Headless Chrome rendering full page

The problem with current headless Chrome is that there is no API to render the full page; you only get the "window" you set via the CLI parameters.
I am using the chrome-remote-interface module; this is the capture example:
const fs = require('fs');
const CDP = require('chrome-remote-interface');
CDP({ port: 9222 }, client => {
// extract domains
const {Network, Page} = client;
Page.loadEventFired(() => {
const startTime = Date.now();
setTimeout(() => {
Page.captureScreenshot()
.then(v => {
let filename = `screenshot-${Date.now()}`;
fs.writeFileSync(filename + '.png', v.data, 'base64');
console.log(`Image saved as ${filename}.png`);
let imageEnd = Date.now();
console.log('image success in: ' + (+imageEnd - +startTime) + "ms");
client.close();
});
}, 5e3);
});
// enable events then start!
Promise.all([
// Network.enable(),
Page.enable()
]).then(() => {
return Page.navigate({url: 'https://google.com'});
}).catch((err) => {
console.error(`ERROR: ${err.message}`);
client.close();
});
}).on('error', (err) => {
console.error('Cannot connect to remote endpoint:', err);
});
To render the full page, one slower and hackier solution would be partial rendering: set a fixed height, scroll through the page, and take a screenshot every X pixels. The problem is how to drive the scrolling part. Would it be better to inject custom JS, or is it doable through the Chrome remote interface?
Have you seen this?
https://medium.com/@dschnr/using-headless-chrome-as-an-automated-screenshot-tool-4b07dffba79a
This bit sounds like it would solve your issue:
// Wait for page load event to take screenshot
Page.loadEventFired(async () => {
// If the `full` CLI option was passed, we need to measure the height of
// the rendered page and use Emulation.setVisibleSize
if (fullPage) {
const {root: {nodeId: documentNodeId}} = await DOM.getDocument();
const {nodeId: bodyNodeId} = await DOM.querySelector({
selector: 'body',
nodeId: documentNodeId,
});
const {model: {height}} = await DOM.getBoxModel({nodeId: bodyNodeId});
await Emulation.setVisibleSize({width: viewportWidth, height: height});
// This forceViewport call ensures that content outside the viewport is
// rendered, otherwise it shows up as grey. Possibly a bug?
await Emulation.forceViewport({x: 0, y: 0, scale: 1});
}
setTimeout(async function() {
const screenshot = await Page.captureScreenshot({format});
const buffer = new Buffer(screenshot.data, 'base64');
file.writeFile('output.png', buffer, 'base64', function(err) {
if (err) {
console.error(err);
} else {
console.log('Screenshot saved');
}
client.close();
});
}, delay);
});
Chrome remote interface supports simulating scrolling gestures using the Input domain.
// scroll down y axis 9000px
Input.synthesizeScrollGesture({x: 500, y: 500, yDistance: -9000});
more info:
https://chromedevtools.github.io/devtools-protocol/tot/Input/
You may also be interested in the Emulation domain. dpd's answer contains a few now-removed methods. I believe Emulation.setVisibleSize might work for you.
https://chromedevtools.github.io/devtools-protocol/tot/Emulation/
