Scrape and store Shopify ecommerce websites using Node.js

I wrote code to scrape an array of Shopify ecommerce websites using the website-scraper npm module in Node.js, but it returns a 403 error, even though the same code works for other websites.
How can we get around this problem?
My scraperTest.js file is:
var scrape = require('website-scraper');
let test = require('./test');

let urls = [];
urlList = ['1500.academy'];

urlList.forEach(url => {
    test.checkRedirect(url)
        .then(domain => {
            urls.push('https://' + domain);
            console.log(urls);
            var options = {
                urls: urls,
                directory: './autochat/',
                'User-Agent': 'request',
            };
            // with promise
            scrape(options).then((result) => {
                /* some code here */
            }).catch((err) => {
                /* some code here */
            });
            // or with callback
            scrape(options, (error, result) => {
                /* some code here */
            });
        });
});
and my test.js file is:
const request = require('request');
const extractDomain = require('extract-domain');
//var link = 'oneplustwocase.com';

function checkRedirect(link) {
    return new Promise((resolve, reject) => {
        var url = "http://" + link;
        var options = {
            url: url,
            headers: {
                'User-Agent': 'request'
            }
        };
        request(options, function (error, response, body) {
            if (response !== undefined) {
                // only touch response after confirming it exists
                let redirectedDomain = extractDomain(response.request.uri.href);
                if (response.statusCode === 200 && link !== redirectedDomain) {
                    resolve(redirectedDomain);
                } else {
                    resolve(link);
                }
            } else {
                resolve(link);
            }
        });
    });
}

module.exports.checkRedirect = checkRedirect;

I found the solution.
We can fetch the HTML of the domain using request(); response.body contains the HTML data.
Here is the code that worked for me:
const request = require('request');
const extractDomain = require('extract-domain');
let fs = require('fs');

function checkRedirect(link) {
    var url = "http://" + link;
    var options = {
        url: url,
        headers: {
            'User-Agent': 'request'
        }
    };
    request(options, function (error, response, body) {
        if (response !== undefined) {
            let redirectedDomain = extractDomain(response.request.uri.href);
            // save the fetched HTML to <domain>.html
            let writeStream = fs.createWriteStream(redirectedDomain + '.html');
            writeStream.write(response.body);
            writeStream.end();
        }
    });
}

module.exports.checkRedirect = checkRedirect;
// Example usage: checkRedirect('oneplustwocase.com')

Since you are interested in the data, save yourself the headache of scraping and simply download the site's XML sitemap. It contains all the products and interesting information, just like Google or any other search engine sees them.
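For example, Shopify stores typically serve a sitemap index at /sitemap.xml that links to per-product sitemaps. A minimal sketch of fetching it with the same request module used above (the store URL is a placeholder):
const request = require('request');

// Hypothetical store URL; Shopify shops typically serve a sitemap
// index at /sitemap.xml that links to sitemap_products_*.xml files
// listing every product page.
request({
    url: 'https://example-store.myshopify.com/sitemap.xml',
    headers: { 'User-Agent': 'Mozilla/5.0' }
}, function (error, response, body) {
    if (error) return console.error(error);
    console.log(body); // raw XML of the sitemap index
});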

It seems that the website http://1500.academy returns 403 if it doesn't like the User-Agent header. I suggest trying a user-agent that looks like a browser.
According to the website-scraper documentation (https://www.npmjs.com/package/website-scraper#request), you should pass headers for the request in the request property, not at the root level.
So the options should look like:
const options = {
    urls: [{ url: 'http://1500.academy/' }],
    directory: './autochat/',
    request: {
        headers: {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        }
    }
};
By the way, website-scraper follows redirects by default, so you can skip the redirect check.
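Putting this together with the original scraperTest.js, a minimal sketch of the corrected call might look like the following (the User-Agent string and the log message are illustrative only):
const scrape = require('website-scraper');

const options = {
    urls: [{ url: 'http://1500.academy/' }],
    directory: './autochat/',
    request: {
        headers: {
            // any realistic browser User-Agent should do
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
        }
    }
};

scrape(options)
    .then((result) => console.log('Saved', result.length, 'resource(s)')) // result is an array of saved resources
    .catch((err) => console.error(err));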

Related

Get the list of repositories of users from GitHub

I'm trying to achieve the following in an application created from scratch using Node.js:
Read the list of users from a file in my solution.
Get all the public repositories of those users.
Below is my code:
const express = require("express");
const app = express();
const request = require('request');
const fs = require("fs");
var path = require("path");
var rootDir = process.argv.length > 2 ? process.argv[2] : process.cwd();
var filePath = path.join(rootDir, "userList.txt");
const https = require('https');

app.listen(3002, function () {
    console.log("Server running on port 3002...");
});

app.get("/getUserRepository", function (req, res, next) {
    fs.readFile("myFilePath/myFile.txt", { encoding: "UTF8" }, function (err, userListObject) {
        getDataObject(userListObject);
    });
});

function getDataObject(userList) {
    var userRepoData = [];
    var userListArray = userList.split(",");
    userListArray.forEach(function (userListObject) {
        https.request("https://api.github.com/users/" + userListObject + "/repos", function (res) {
            res.setEncoding('utf8');
            res.on('data', function (data) {
                userRepoData.push(JSON.parse(data));
            });
        }).end();
    });
}
The challenge I'm facing is that when I make a separate call to get the repos of each user, I get the exception:
"Request forbidden by administrative rules. Please make sure your request has a User-Agent header (http://developer.github.com/v3/#user-agent-required)."
I'm not finding any example or approach showing where I can add the User-Agent.
Also, one more thing I want to know is whether this is the best approach to achieve what I want.
Try this:
userListArray.forEach(function (userListObject) {
    // https.request() takes the URL as its first argument (Node 10.9+);
    // the headers go in the options object
    https.request(
        "https://api.github.com/users/" + userListObject + "/repos",
        { headers: { 'User-Agent': 'my node app' } },
        function (res) {
            res.setEncoding('utf8');
            res.on('data', function (data) {
                userRepoData.push(JSON.parse(data));
            });
        }
    ).end();
});
As the error says, you are missing a User-Agent header. Refactor your code like this:
function getDataObject(userList) {
    let userRepoData = [];
    let userListArray = userList.split(",");
    userListArray.forEach(function (userListObject) {
        // build the options per user; userListObject is the username
        let options = {
            host: 'api.github.com',
            path: '/users/' + userListObject + '/repos',
            method: 'GET',
            headers: { 'user-agent': 'node.js' }
        };
        https.request(options, function (res) {
            res.setEncoding('utf8');
            res.on('data', function (data) {
                userRepoData.push(JSON.parse(data));
            });
        }).end();
    });
}
You can read more about User-Agent here.
Basically, you need to declare how you are trying to access the GitHub API. A browser sends the User-Agent automatically, and so does Postman, but when writing a script you need to pass the User-Agent manually.
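One more caveat not covered above: the question's code parses every 'data' chunk on its own, which breaks as soon as a response arrives in more than one chunk. A sketch that sets the User-Agent and buffers the whole body before parsing (the helper name and user-agent value are arbitrary):
const https = require('https');

// Hypothetical helper: fetch one user's repos with a User-Agent set,
// buffering all chunks and parsing only once the response has ended.
function getUserRepos(username, callback) {
    const options = {
        host: 'api.github.com',
        path: '/users/' + username + '/repos',
        method: 'GET',
        headers: { 'User-Agent': 'my-node-app' } // any identifying string works
    };
    https.request(options, function (res) {
        let body = '';
        res.setEncoding('utf8');
        res.on('data', function (chunk) { body += chunk; });
        res.on('end', function () { callback(null, JSON.parse(body)); });
    }).on('error', callback).end();
}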

nodejs http request using async/await syntax [duplicate]

In my program I make an async call to my function from another API module:
var info = await api.MyRequest(value);
Module code:
var request = require("request");

module.exports.MyRequest = async function MyRequest(value) {
    var options = {
        uri: "http://some_url",
        method: "GET",
        qs: { // query string like ?key=value&...
            key: value
        },
        json: true
    };
    try {
        var result = await request(options);
        return result;
    } catch (err) {
        console.error(err);
    }
};
Execution returns immediately; however, result, and therefore info, contains the request object and the request body (info.body looks like key=value&...), not the required response body.
What am I doing wrong? How can I fix it? What is the proper usage of request with async, or does it only work correctly with promises, as mentioned here: Why await is not working for node request module? The following article suggests it is possible: Mastering Async Await in Node.js.
You need to use the request-promise module, not the request module or http.request().
await works on functions that return a promise, not on functions that return the request object and expect you to use callbacks or event listeners to know when things are done.
The request-promise module supports the same features as the request module, but asynchronous functions in it return promises so you can use either .then() or await with them rather than the callbacks that the request module expects.
So, install the request-promise module and then change this:
var request = require("request");
to this:
const request = require("request-promise");
Then, you can do:
var result = await request(options);
EDIT Jan, 2020 - request() module in maintenance mode
FYI, the request module and its derivatives like request-promise are now in maintenance mode and will not be actively developed to add new features. You can read more about the reasoning here. There is a list of alternatives in this table with some discussion of each one.
I have been using got() myself and it's built from the beginning to use promises, supports many of the same options as the request() module and is simple to program.
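For reference, here is a rough sketch of the module above rewritten with got, assuming got v11's promise API (searchParams replaces request's qs option):
const got = require('got');

module.exports.MyRequest = async function MyRequest(value) {
    try {
        // .json() awaits the response and parses the body in one step
        return await got('http://some_url', {
            searchParams: { key: value } // equivalent of request's qs option
        }).json();
    } catch (err) {
        console.error(err);
    }
};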
Pretty sure you can also do the following. If what you need does not return a Promise by default, you can wrap it with the Promise constructor. The answer above is less verbose, though.
const request = require('request'); // needed for request.get below

async function getBody(url) {
    const options = {
        url: url,
        method: 'GET',
    };
    // Return new promise
    return new Promise(function (resolve, reject) {
        // Do async job
        request.get(options, function (err, resp, body) {
            if (err) {
                reject(err);
            } else {
                resolve(body);
            }
        });
    });
}
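It can then be awaited like any other promise, for example:
(async () => {
    const body = await getBody('https://www.example.com');
    console.log(body);
})();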
I just managed to get it to work with async/await. I wrapped it inside a function promisifiedRequest to return a promise that runs the request callback and resolves or rejects it based on error and response.
const request = require('request');

const promisifiedRequest = function (options) {
    return new Promise((resolve, reject) => {
        request(options, (error, response, body) => {
            if (error) {
                return reject(error);
            }
            if (response) {
                return resolve(response);
            }
            // neither error nor response: fail rather than hang forever
            return reject(new Error('No response received'));
        });
    });
};
(async function () {
    const options = {
        url: 'https://www.google.com',
        method: 'GET',
        gzip: true,
        headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'
        }
    };
    let response = await promisifiedRequest(options);
    console.log(response.headers);
    console.log(response.body);
})();
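As an alternative to a hand-rolled wrapper, Node's built-in util.promisify can do the same job. Note that it resolves with only the first success argument, i.e. the response; the body is still available as response.body:
const util = require('util');
const request = require('request');

// promisify resolves with the response object; the body is response.body
const requestAsync = util.promisify(request);

(async () => {
    const response = await requestAsync({ url: 'https://www.google.com' });
    console.log(response.statusCode, response.body.length);
})();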
Since request-promise has been deprecated, here are other options that don't depend on the npm request package. got has been mentioned already, but it depends on 11 other packages. axios, in contrast, has only 1 dependency (for redirects); everything else is implemented natively on top of the built-in Node.js modules.
Here is the same example using axios:
const axios = require('axios')
const response = await axios.get(url)
const result = response.data
or, as a one-liner in JavaScript
const result = (await axios.get(url)).data
One-liner in TypeScript:
const {data} = await axios.get(url)
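One caveat worth knowing: axios rejects the promise on non-2xx status codes, so wrap the call in try/catch if you need the status:
try {
    const { data } = await axios.get(url);
    console.log(data);
} catch (err) {
    // err.response is set when the server replied with a non-2xx status
    console.error(err.response ? err.response.status : err.message);
}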
For simple cases where you don't need advanced features like cookies, following redirects, and retrying, you can use the native http/https modules to make requests:
const https = require('https')

async function fetch(url) {
    return new Promise((resolve, reject) => {
        const request = https.get(url, { timeout: 1000 }, (res) => {
            if (res.statusCode < 200 || res.statusCode > 299) {
                return reject(new Error(`HTTP status code ${res.statusCode}`))
            }
            const body = []
            res.on('data', (chunk) => body.push(chunk))
            res.on('end', () => {
                const resString = Buffer.concat(body).toString()
                resolve(resString)
            })
        })
        request.on('error', (err) => {
            reject(err)
        })
        request.on('timeout', () => {
            request.destroy()
            reject(new Error('timed out'))
        })
    })
}
const res = await fetch('https://...')
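Finally, note that Node.js 18 and later ship a WHATWG-style fetch as a global, so on recent versions no wrapper is needed at all (shown here for a module where the custom fetch above is not in scope):
// Node 18+ only: global fetch, no require needed
const res = await fetch('https://www.example.com');
if (!res.ok) throw new Error(`HTTP status code ${res.status}`);
const body = await res.text();
console.log(body);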


Handling redirecting request in NodeJs

I am trying to get the data from the URL: autotrader_url
Unfortunately, I am not able to handle the redirect.
Here is my code so far:
var request = require('request');
var cheerio = require('cheerio');

request({
    followAllRedirects: true,
    url: "http://www.autotrader.com/cars-for-sale/showcase.xhtml?zip=94536&endYear=2017&Log=0&modelCode1=LEGACY&sortBy=derivedpriceDESC&startYear=1981&makeCode1=SUB&numRecords=25&searchRadius=25&mmt=%5BSUB%5BLEGACY%5B%5D%5D%5B%5D%5D&makeCodes=SUB"
}, function (error, response, html) {
    console.log(response);
    if (!error && response.statusCode == 200) {
        console.log("yo");
        var $ = cheerio.load(html);
        console.log($("title").text());
        $('div.listing-title h2').each(function (i, element) {
            var a = $(this);
            console.log(a.innerHTML);
        });
    }
});
What am I missing?
The followAllRedirects: true option follows HTTP redirects sent back by the server. It appears they are not using an HTTP redirect: when you visit that page in a browser, it loads a page that says "We're searching for the car that you want", and that page then does a client-side redirect using JavaScript. To follow that kind of redirect you probably have to use something like PhantomJS.
Alternatively, using cheerio (maybe) or some combination of regexes, you could get the redirect URL directly from the page source and then make a second request to that URL on your own once you have it:
<script type="text/javascript">
    $(function() {
        atc.create(atc.showcaseRedirect, {
            destinationUrl: '/cars-for-sale/Subaru/Legacy/Fremont+CA-94536?endYear=2017&firstRecord=0&makeCode1=SUB&mmt=%5BSUB%5BLEGACY%5B%5D%5D%5B%5D%5D&modelCode1=LEGACY&searchRadius=25&showcaseOwnerId=68619541&startYear=1981&Log=0',
            queryString: '',
            ....
        }).init();
    });
</script>
You just have to grab that destinationUrl. With all that said, this assumes you aren't breaking any of their terms of use, so you should definitely look into that before you move forward.
I'm not sure if it's a bug on their end or if they are trying to prevent people from scraping, but you need to set a User-Agent header to get them to respond, so add one to your request.
Here is a full working example:
var request = require('request');
var cheerio = require('cheerio');

var firstUrl = "http://www.autotrader.com/cars-for-sale/showcase.xhtml?zip=94536&endYear=2017&Log=0&modelCode1=LEGACY&sortBy=derivedpriceDESC&startYear=1981&makeCode1=SUB&numRecords=25&searchRadius=25&mmt=%5BSUB%5BLEGACY%5B%5D%5D%5B%5D%5D&makeCodes=SUB";

makeRequest(firstUrl, function (err, html) {
    if (err) {
        return console.log('There was a problem');
    }
    // get the "redirect" url from the page source
    var re = new RegExp("destinationUrl:[^,}]*");
    var redirectUrl = 'http://www.autotrader.com' +
        html.match(re)[0].replace('destinationUrl: ', '').replace(/'/g, '');
    console.log('redirectUrl', redirectUrl);
    // make the second request and process the markup with cheerio
    makeRequest(redirectUrl, processFinalMarkup);
});

function processFinalMarkup(err, html) {
    var $ = cheerio.load(html);
    console.log($("title").text());
    $('div.listing-title h2').each(function (i, element) {
        var a = $(this);
        console.log(a.html()); // cheerio objects use .html(), not .innerHTML
    });
}

function makeRequest(url, callback) {
    request({
        // Their page requires a User-Agent to be set.
        headers: {
            'User-Agent': 'express'
        },
        followAllRedirects: true,
        url: url
    }, function (error, response, html) {
        console.log(response.headers, response.statusCode);
        if (!error && response.statusCode == 200) {
            console.log("yo");
            callback(null, html);
        }
    });
}

Podio API addItem call

I'm trying to implement the Podio API addItem call (https://developers.podio.com/doc/items/add-new-item-22362) in a Node.js module. Here is the code:
var _makeRequest = function (type, url, params, cb) {
    var headers = {};
    if (_isAuthenticated) {
        headers.Authorization = 'OAuth2 ' + _access_token;
    }
    console.log(url, params);
    _request({ method: type, url: url, json: true, form: params, headers: headers }, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            cb.call(this, body);
        } else {
            console.log('Error occurred while launching a request to Podio: ' + error + '; body: ' + JSON.stringify(body));
        }
    });
};

exports.addItem = function (app_id, field_values, cb) {
    _makeRequest('POST', _baseUrl + "/item/app/" + app_id + '/', { fields: { 'title': 'fgdsfgdsf' } }, function (response) {
        cb.call(this, response);
    });
};
It returns the following error:
{"error_propagate":false,"error_parameters":{},"error_detail":null,"error_description":"No matching operation could be found. No body was given.","error":"not_found"}
Only the "title" attribute is required in the app; I checked that in the Podio GUI. I also tried removing the trailing slash from the URL I post to; then a similar error occurs, but with a URL-not-found message in the error description.
I'm going to set up a proxy to capture the raw request, but maybe someone can just spot the error in the code?
Any help is appreciated.
Never mind, I found a solution. The thing is that the addItem call was my first "real" API method implementation with JSON parameters in the body. The earlier calls were authentication and getApp, which is a GET and doesn't take any parameters.
The problem is that Podio supports POSTed key-value pairs for authentication but not for all the calls, and I was trying to use a single _makeRequest() method for everything, both auth and real API calls.
Looks like I need to implement one for auth and one for all other API calls.
Anyway, if someone needs a working proof of concept for the addItem call in Node, here it is (assuming you've got an auth token beforehand):
_request({
    method: 'POST',
    url: "https://api.podio.com/item/app/" + app_id + '/',
    headers: headers,
    body: JSON.stringify({ fields: { 'title': 'gdfgdsfgds' } })
}, function (error, response, body) {
    console.log(body);
});
You should set Content-Type to application/json and send the body as stringified JSON:
const getHeaders = async () => {
    const headers = {
        Accept: 'application/json',
        'Content-Type': 'application/json; charset=utf-8',
    };
    const token = "YOUR APP TOKEN HERE";
    headers.Authorization = `Bearer ${token}`;
    return headers;
};

const createItem = async (data) => {
    const uri = `https://api.podio.com/item/app/${APP_ID}/`;
    const payload = {
        fields: {
            [data.FIELD_ID]: [data.FIELD_VALUE],
        },
    };
    const response = await fetch(uri, {
        method: 'POST',
        headers: await getHeaders(),
        body: JSON.stringify(payload),
    });
    const newItem = await response.json();
    return newItem;
};
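A hypothetical usage sketch; APP_ID, the field external_id 'title', and the value are placeholders for your own app's configuration:
(async () => {
    const newItem = await createItem({
        FIELD_ID: 'title',            // a field external_id from your app
        FIELD_VALUE: 'My first item', // the value to store in that field
    });
    console.log(newItem); // the created item, including its item_id
})();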
