Scrape part of page that is not html - node.js

I want to scrape this site.
I'm using Node.js and PhantomJS through the phantom npm package.
This is my code:
var phantom = require('phantom');
var loadInProgress = false;
var url = '';
(async function() {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onResourceRequested', function(requestData) {'Requesting', requestData.url);
await page.on('onConsoleMessage', function(msg) {;
await page.on('onLoadStarted', function() {
loadInProgress = true;
console.log('Load started...');
await page.on('onLoadFinished', function() {
loadInProgress = false;
console.log('Load end');
const status = await;
await console.log('STATUS:', status);
const content = await'content');
await console.log('CONTENT:', content);
// submit
await page.evaluate(function() {
document.getElementById('lblFilteBy').value = 'Country, area or territory'; //'WHO region';
document.getElementById('lblSelectBy').value = 'Italy'; //'European Region of WHO';
document.getElementById('lbl_YearFrom').value = '1995';
document.getElementById('lbl_WeekFrom').value = '1';
document.getElementById('lbl_YearTo').value = '2018';
document.getElementById('ctl_list_WeekTo').value = '53';
//console.log('SUBMIT:', document.getElementById('ctl_ViewReport'));
var result = await page.evaluate(function() {
return document.querySelectorAll('html')[0].outerHTML; // Problem here
await console.log('RESULT:', result);
await instance.exit();
I don't understand what this part (in red) of page is:
It's not HTML, how do I scrape the displayed data?
If I go to 'Network' tab of Chrome dev tools:

You can catch the ajax request, check :
Outlined in blue is the XHR request that you need to make yourself in your Phantom script, and outlined in red is the AJAX result. In the Headers tab, you will see the form data sent via POST to the page.

This is going to be hard. Take a look at this: Node.js web browser with JavaScript execution
Basically, you need a lib that simulates a browser with js execution and use that to render the report, then you can parse it.


Problem with picking HTML element with cheerio.js [duplicate]

I am trying to scrape a website but I don't get some of the elements, because these elements are dynamically created.
I use cheerio in Node.js, and my code is below.
var request = require('request');
var cheerio = require('cheerio');
var url = "";
request(url, function (err, res, html) {
var $ = cheerio.load(html);
$('.listMain > li').each(function () {
This code returns an empty response, because when the page is loaded, the <ul id="store_list" class="listMain"> is empty.
The content has not been appended yet.
How can I get these elements using node.js? How can I scrape pages with dynamic content?
Here you go;
var phantom = require('phantom');
phantom.create(function (ph) {
ph.createPage(function (page) {
var url = "";, function() {
page.includeJs("", function() {
page.evaluate(function() {
$('.listMain > li').each(function () {
}, function(){
Check out GoogleChrome/puppeteer
Headless Chrome Node API
It makes scraping pretty trivial. The following example will scrape the headline over at (assuming .npm-expansions remains)
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('');
const textContent = await page.evaluate(() => {
return document.querySelector('.npm-expansions').textContent
console.log(textContent); /* No Problem Mate */
evaluate will allow for the inspection of the dynamic element as this will run scripts on the page.
Use the new npm module x-ray, with a pluggable web driver x-ray-phantom.
Examples in the pages above, but here's how to do dynamic scraping:
var phantom = require('x-ray-phantom');
var Xray = require('x-ray');
var x = Xray()
x('', 'title')(function(err, str) {
if (err) return done(err);
assert.equal('Google', str);
Answering this as a canonical, an alternative to Puppeteer for scraping dynamic sites which is also well-supported as of 2023 is Playwright. Here's a simple example:
const playwright = require("playwright"); // ^1.28.1
let browser;
(async () => {
browser = await playwright.chromium.launch();
const page = await browser.newPage();
await page.goto("");
const text = await page.locator('h1:text("Example")').textContent();
console.log(text); // => Example Domain
.catch(err => console.error(err))
.finally(() => browser?.close());
The easiest and most reliable solution is to use Puppeteer, which is suitable for both static and dynamic scraping.
The only change needed is the timeout in Browser.js, TimeoutSettings.js, and Launcher.js: raise it from 300000 to 3000000.

Reuse browser instance in Puppeteer

I would like to know if it is possible to have one .js file that opens a browser instance, creates a new page/tab, logs in to a website (with username/password), and just stays idle, and then, in a second .js file, to use the browser instance and page from the first file.
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({
headless: true,
args: ['--no-sandbox'],
ignoreDefaultArgs: ["--hide-scrollbars"]
const page = await browser.newPage();
const response = await page.goto('');
console.log('Browser open in the background (headless)!');
//await browser.close();
const puppeteer = require('puppeteer');
(async () => {
// instructions on browser instance/page from 1.js ...
The crawler object keeps the state of the browser instance and
wherever you call/pass that instance, it refers to the same chromium
in the "background". If this is an overkill, and you just want to
connect to an already running chromium using puppeteer, you can do it
with puppeteer.connect. take a look at this:
How to "hook in" puppeteer into a running Chrome instance/tab – mbit
Yeah, I guess it's too much overkill for me :). The link you posted was what I wanted, but I have two questions.
This is a sample of what I have.
// 1.js
// open chromium, new tab, go to, print browserWSEndpoint, disconnect
const puppeteer = require('puppeteer');
(async () => {
var browser = await puppeteer.launch({headless: false});
var page = await browser.newPage();
var response = await page.goto('');
var browserWSEndpoint = browser.wsEndpoint();
console.log(browserWSEndpoint); // prints: ws://
// 2.js
// connect to the open browser with the browserWSEndpoint manualy put in, ... , disconect.
const puppeteer = require('puppeteer');
(async () => {
var browser = await puppeteer.connect({browserWSEndpoint: 'ws://'});
// somehow use the tab that is open from 1.js (
await browser.disconnect();
I get the browserWSEndpoint string from the console.log 1.js.
It works great but I have two difficulties.
1 - How can I use the variable browserWSEndpoint from 1.js so I don't always have to copy and paste it into 2.js?
2 - If I open a new page/tab in 1.js, go for example to Google, and disconnect (browser.disconnect()), how can I use that page/tab in 2.js?
Working tested code
getEmail.js is where the actual page will be exported. Ask for clarifications in the comments.
const puppeteer = require("puppeteer");
module.exports = {
browser: {},
getBrow: async function(){ try {
this.browser = await puppeteer.connect({browserWSEndpoint: this.pptr_instance_url}).catch(async e =>{
console.log("end point",this.pptr_instance_url);
this.browser = await puppeteer.launch({timeout: 0});
this.pptr_instance_url = this.browser.wsEndpoint();
console.log("line 11",this.pptr_instance_url);
return this.browser;
return this.browser;
}catch (e){
} }
const abc = require("../getBrowsernew")
const pageRenderer = async (request) => {
const {reactProjectUrl} = constants, uuidStorageKey = uuidv4(),
localStorageObject = {[uuidStorageKey]: request.body};
const browser = await abc.getBrow();
let url = ""
await setLocalStorage(browser, url, localStorageObject);
const page = await browser.newPage();
const response = await page.goto(
waitUntil: "networkidle0"
}, {waitUntil: 'load', timeout: 0}
return page;
module.exports = pageRenderer;
const pageRenderer = require("./pageRenderer");
const getEmail =async (request) =>{
const page = await pageRenderer(request)
const emailbody = await page.content();
return emailbody;
module.exports = getEmail;
You can implement this in many ways like having separate modules with functions, or different classes, and it depends on your particular need.
You can have a class that launches the browser and creates pages plus some extra functionalities.
const puppeteer = require('puppeteer');
class Crawler {
constructor() {
//init with whatever values you'll need in your class
//or throw an error if the object wasn't created through build
static async build() {
let crawler = new Crawler();
await crawler._init();
return crawler;
async _init() {
//launch the browser and keep its state
this._browser = await puppeteer.launch({timeout: 0});
//create a page and keep its state
this._page = await this._browser.newPage();
get browser() {
return this._browser;
get page() {
return this._page;
async login(url) {
await this._page.goto(url);
//do whatever is related to the login process
module.exports = {Crawler};
Note that we can't have async functions in the constructor. Since launching browser is async, we use something like a build function to initiate the browser when creating the object. Then we create the crawler object like this:
const {Crawler} = require('./1.js');
(async() => {
let crawler = await;
await crawler.login("");
//access crawler's page
Keep in mind that this is only an example and by no means representative of the best practices. So first, you need to understand what you want to achieve out of such encapsulation, then adopt the method that suits you best.
Read more on JS classes here

How to crawl javascript (vuejs, reactjs) web site with nodejs

I was trying to crawl a website with a Vue.js front end, but when I crawl it, the content is not loaded into cheerio; all I get is a blank web page. My code is as follows:
getSiteContentAsJs = (url) => {
return new Promise((resolve, reject) => {
let j = request.jar();
request.get({url: url, jar: j}, function(err, response, body) {
return resolve({body: null, jar: j, error: err});
return resolve({body: body, jar: j, error: null});
I got my content as follows
const { body, jar, error} = await getSiteContentAsJs(url);
//I passed body to cheerio to get the js object out of the web content
const $ = cheerio.load(body);
But nothing is rendered; I just get a blank web page with no content in it.
I found that cheerio doesn't run JavaScript. Since this website is based on a Vue front end, I needed a virtual browser that actually runs the JS and gives me the rendered output.
So instead of using request, I used phantom to render JS web pages:
const phantom = require('phantom');
const cheerio = require('cheerio');
loadJsSite = async (url) => {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onResourceRequested', function(requestData) {'Requesting', requestData.url);
const status = await;
const content = await'content');
// console.log(content);
// let $ = cheerio.load(content);
await instance.exit();
return {$: cheerio.load(content), content: content};
now I can get the rendered page like below
const {$, content} = await loadJsSite(url);
// I can query like this
// get the body

screenshotting urls with node - async

Here is my code:
const fs = require('fs');
const screenshot = require('screenshot-stream');
const urlp = require('url');
var urls=[
const stream = screenshot(url, '1024x768', {crop: true});
stream.pipe(fs.createWriteStream(urlp.parse(url).hostname + 'test-1024x768.png'));
It only screenshots the last URL in the list. The others are images with zero bytes. I think I need to do the operation asynchronously so it doesn't overwrite the stream each time.
How would I do this?
I want the screenshot to work, but catch errors and not block if a url is not accessible
UPDATE: worked better although this code below still slows my computer down a lot!
const screenshotPromise = require('screenshot-promise');
urls.forEach(function(url) {
const promise = screenshotPromise(url, '1024x768', {crop: true}).then(buf => {
fs.writeFileSync(urlp.parse(url).hostname + 'test-1024x768.png', buf);
promise.then((value) => {
// value is whatever we passed in the resolve(...) function above.
// It doesn't have to be a string, but if it is only a succeed message, it probably will be.
What you failed to add here is the error :
throw er; // Unhandled 'error' event
Error: Couldn't load url:
at emitOne (events.js:96:13)
at LineStream.emit (events.js:188:7)
The issue is that the module screenshot-stream is using PhantomJS, and phantomJS is unable to get to the page that outputs the error.
This error seems related to this issue: the affected pages seem to use web fonts (e.g. "BebasNeue-webfont.ttf"), which Qt loads as application fonts. Something may be going wrong there.
My suggestion is using Google's Puppeteer that includes a built-in screenshot method :
Code I did in the end that worked:
const puppeteer = require('puppeteer');
const urlp = require('url');
var URL = require('url-parse');
var urls = [
var getLocation = function(href) {
var l = document.createElement("a");
l.href = href;
return l;
(async() => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
timeout: 40000
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
var url1 = new URL(url);
try {
await page.goto(`${url}`);
await page.screenshot({
path: 'images/' + url1.hostname + '.png'
} catch (error) {
// await page.close();
// await browser.close();
// process.exit(1);

Scrape information with form submit using Phantom

I want to do web scraping of this site.
I have seen that the APIs are available but, as suggested by duraid in my previous question, it is not advisable to use them.
So I tried to use Node.js and PhantomJS through the phantom npm package.
This is my code:
var phantom = require('phantom');
// object of methods
var methods = {};
var loadInProgress = false;
var url = ''; = async function(req, res) {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onResourceRequested', function(requestData) {'Requesting', requestData.url);
await page.on('onConsoleMessage', function(msg) {;
await page.on('onLoadStarted', function() {
loadInProgress = true;
console.log('Load started...');
await page.on('onLoadFinished', function() {
loadInProgress = false;
console.log('Load end');
const status = await;
console.log('STATUS:', status);
const content = await'content');
console.log('CONTENT:', content);
// submit
await page.evaluate(function() {
document.getElementById('crID%3a250').value = 'crID%3a250'; // France
document.getElementById('timeID%3a79').value = 'timeID%3a79'; // 2015
document.getElementById('varID%3a2').value = 'varID%3a2'; // Medium
document.getElementById('ctl00_main_filters_anchorApplyBottom').submit(); // submit button
var result = await page.evaluate(function() {
return document.querySelectorAll('html')[0].outerHTML;
console.log('RESULT:', result);
await instance.exit();
module.exports = methods;
(How can they select more countries and more years?)
I tried to select France as Country or Area, 2015 as a Year and medium as a Variants.
So crID%3a250 is id of element:
<input type="checkbox" id="crID%3a250" value="crID%3a250" name="France" />
<label for="crID%3a250">France</label><br />
timeID%3a79 is id of element:
<input type="checkbox" id="timeID%3a79" value="timeID%3a79" name="2015" />
<label for="timeID%3a79">2015</label><br />
varID%3a2 is id of element:
<input type="checkbox" id="varID%3a2" value="varID%3a2" name="Medium" />
<label for="varID%3a2">Medium</label><br />
And then ctl00_main_filters_anchorApplyBottom is id of button element:
<div class="All">
<img src="_Images/IconUpdateResults.png" alt="Update" width="11px" height="11px" title="Apply filters" /> Apply Filters
But what I got is the web page itself (in HTML), not the data that interest me.
So it's as if I had not selected any parameters. Why?
Following the advice of @Vaviloff, I tried to change the code, but without success.
My server-side language is Node.js.
Using Phantom I modified the code like this: = async function(req, res) {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onResourceRequested', function(requestData) {
console.log('Requesting', requestData.url);
await page.on('onConsoleMessage', function(msg) {
const status = await;
console.log('\n\nSTATUS:', status);
// submit
await page.evaluate(function() {
var countries = {
'Albania': 'crID%3a8',
'Austria': 'crID%3a40',
'Belgium': 'crID%3a56',
'Bulgaria': 'crID%3a100',
'Croatia': 'crID%3a191',
'Cyprus': 'crID%3a196',
'Denmark': 'crID%3a208',
'Estonia': 'crID%3a233',
'Finland': 'crID%3a246',
'France': 'crID%3a250',
'Germany': 'crID%3a276',
'Greece': 'crID%3a300',
'Iceland': 'crID%3a352',
'Ireland': 'crID%3a372',
'Italy': 'crID%3a380',
'Latvia': 'crID%3a428',
'Netherlands': 'crID%3a528',
'Norway': 'crID%3a578',
'Poland': 'crID%3a616',
'Portugal': 'crID%3a620',
'Romania': 'crID%3a642',
'Slovakia': 'crID%3a703',
'Slovenia': 'crID%3a705',
'Spain': 'crID%3a724',
'Sweden': 'crID%3a752',
'Switzerland': 'crID%3a756',
'United Kingdom': 'crID%3a826'
// 2018 - 1980
var years = ['timeID%3a83', 'timeID%3a82', 'timeID%3a81', 'timeID%3a79', 'timeID%3a78', 'timeID%3a77', 'timeID%3a76', 'timeID%3a75', 'timeID%3a73', 'timeID%3a72', 'timeID%3a71', 'timeID%3a70', 'timeID%3a69', 'timeID%3a67', 'timeID%3a66', 'timeID%3a65', 'timeID%3a64', 'timeID%3a63', 'timeID%3a61', 'timeID%3a60', 'timeID%3a59', 'timeID%3a58', 'timeID%3a57', 'timeID%3a55', 'timeID%3a54', 'timeID%3a53', 'timeID%3a52', 'timeID%3a51', 'timeID%3a49', 'timeID%3a48', 'timeID%3a47', 'timeID%3a46', 'timeID%3a45', 'timeID%3a43', 'timeID%3a42', 'timeID%3a41', 'timeID%3a40', 'timeID%3a39', 'timeID%3a37'];
// select countries
for(var c in countries) {
document.getElementById(countries[c]).setAttribute('checked', true);
// select years
for(var y in years) {
document.getElementById(years[y]).setAttribute('checked', true);
// select variants
document.getElementById('varID%3a2').setAttribute('checked', true); // medium
// click button
console.log('\nWaiting 1.5 seconds...');
await timeout(1500);
// get only the table contents
var result = await page.evaluate(function() {
return document.querySelectorAll('.DataContainer table')[0].outerHTML;
console.log('\n\nRESULT:', result);
await instance.exit();
function elaborateResult(res) {
var el = document.createElement('html'); // ** ERROR HERE **
el.innerHTML = result;
console.log('\n\nTD ELEMENTS:', el.getElementsByTagName('td'));
//var obj = utilFunc.createJsonObjectPop(year, country, population);
There are two errors:
1. result contains only the values that are on the first page of the results, but with the selections made you get 22 pages of results, and I don't understand how I can get all the values that interest me and combine them in the variable result.
2. Assuming the problem in point (1) is solved, I should then process the results obtained and create an object like this:
var date = [{year: 2018, country: 'Albania', population: 2934.363}, {year: 2017, country: 'Albania', population: 2930.187}, ..., {year: 1980, country: 'United Kingdom ', population: 56265.475}]
This is what the elaborateResult(res) function should do (of course, the function is not complete, I have to finish it but I get an error at the first line), but I get the error:
ReferenceError: document is not defined
So I changed my strategy and I tried not to use Phantom but a normal request:
var options = {
uri: ';timeID%3a79&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=302',
transform: function(body) {
return cheerio.load(body);
}; = async function(req, res) {
.then(function($) {
console.log('\n\nTHEN: ', $);
.catch(function(err) {
console.log('Error', err.stack());
If I run this code I get:
THEN: function (selector, context, r, opts) {
if (!(this instanceof initialize)) {
return new initialize(selector, context, r, opts);
opts = _.defaults(opts || {}, options);
return, selector, context, r || root, opts);
In this case I have other problems.
I don't know how to build the url.
In the example above I chose Albania (crID%3a8) and Austria (crID%3a40) and 2015 as the year (timeID%3a79).
Yet if I go to the link just built, I get as a result the data on Albania from 2100 to 2095.
I don't know how to select the years or how to select variants or how to change pages.
I feel a bit stupid but I can't get what I want... I'm stuck.
Help would be very welcome!
There are several issues with your script that prevent successful scrape.
To check a checkbox, you don't set its value again (it's already set in HTML!), you set its checked attribute to true:
document.getElementById('crID%3a250').setAttribute("checked", true); // France
The button that submits the form is a hyperlink <a> which doesn't have a submit method, it should be clicked (it even has onClick function in the code)
document.getElementById('ctl00_main_filters_anchorApplyBottom').click(); // submit the form
**The search request** is sent through AJAX and takes time to complete, so your script should wait for at least a second before trying to fetch the data. I'll show how to wait in the full working code below.
Next, you can get only the table data; there is no need to sift through all the HTML:
var result = await page.evaluate(function() {
return document.querySelectorAll('.DataContainer table')[0].outerHTML;
Here's a slightly trimmed-down version of your script with the issues corrected:
var phantom = require('phantom');
var url = '';
/**
 * Pause helper: builds a promise that settles (with no value) once the
 * given delay has elapsed. Intended to be awaited, e.g. `await timeout(1500)`.
 *
 * @param {number} ms - delay in milliseconds, passed straight to setTimeout
 * @returns {Promise<undefined>} promise resolved by the timer callback
 */
const timeout = (ms) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });
(async function(req, res) {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onResourceRequested', function(requestData) {'Requesting', requestData.url);
await page.on('onConsoleMessage', function(msg) {;
const status = await;
await console.log('STATUS:', status);
// submit
await page.evaluate(function() {
document.getElementById('crID%3a250').setAttribute("checked", true); // France
document.getElementById('timeID%3a79').setAttribute("checked", true); // 2015
document.getElementById('varID%3a2').setAttribute("checked", true); // Medium
document.getElementById('ctl00_main_filters_anchorApplyBottom').click(); // click submit button
console.log('Waiting 1.5 seconds..');
await timeout(1500);
// Get only the table contents
var result = await page.evaluate(function() {
return document.querySelectorAll('.DataContainer table')[0].outerHTML;
await console.log('RESULT:', result);
await instance.exit();
Last but not least, you could simply try to replay the AJAX request made by the form; you will find that the URL of the search request works quite well on its own when just opened in another tab:
You don't even need a headless browser to get it, just cURL/requests to fetch it and then process the result. This happens with sites a lot, so it's useful to check the Network tab in your browser devtools before scraping.
And if there are so many results that they are scattered over several pages, there is one more parameter to be used in request: Page:,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=461
