Invalid magic number of a png file - node.js

I am trying to verify whether a link is a valid image by checking its magic number. Most image links work fine, but there is a set of images on Trump's site that does not produce the correct magic numbers, even though they load fine in a browser. The magic number they produce is 3c21444f.
Below is my code; any help would be appreciated:
var request = require('request');

var magic = {
    jpg: 'ffd8ffe0',
    jpg1: 'ffd8ffe1',
    png: '89504e47',
    gif: '47494638'
};

var options = {
    method: 'GET',
    url: 'https://assets.donaldjtrump.com/gallery/4749/screen_shot_2016-10-30_at_1.39.54_pm.png',
    encoding: null // keeps the body as a buffer
};

request(options, function (error, response, body) {
    if (!error) {
        var magicNumberInBody = body.toString('hex', 0, 4);
        if (magicNumberInBody == magic.jpg ||
            magicNumberInBody == magic.jpg1 ||
            magicNumberInBody == magic.png ||
            magicNumberInBody == magic.gif) {
            console.log('Valid image');
        } else {
            console.log('Invalid Image', magicNumberInBody);
        }
    }
});

So apparently the issue was Cloudflare blocking my requests for those images: 3c21444f is ASCII for "<!DO", the start of an HTML error page rather than an image. I fixed it by sending a User-Agent header when requesting those images.
var options = {
    method: 'GET',
    url: 'https://assets.donaldjtrump.com/gallery/4749/screen_shot_2016-10-30_at_1.39.54_pm.png',
    headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
    },
    encoding: null // keeps the body as a buffer
};
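For completeness, here is a sketch of the same check with the HTML signature added, so a blocked request is reported explicitly instead of as a generic invalid image. The html entry is my own addition; options is the object defined just above:
var request = require('request');

var magic = {
    jpg: 'ffd8ffe0',
    jpg1: 'ffd8ffe1',
    png: '89504e47',
    gif: '47494638',
    html: '3c21444f' // "<!DO": an HTML page (e.g. a Cloudflare block page), not an image
};

request(options, function (error, response, body) {
    if (error) return console.error(error);
    var magicNumberInBody = body.toString('hex', 0, 4);
    if (magicNumberInBody === magic.html) {
        console.log('Got HTML instead of an image - the request was probably blocked');
    } else if ([magic.jpg, magic.jpg1, magic.png, magic.gif].indexOf(magicNumberInBody) !== -1) {
        console.log('Valid image');
    } else {
        console.log('Invalid image', magicNumberInBody);
    }
});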

Related

NodeJS GET request working in local machine but not inside remote VM

I am trying to create a simple tool that hits a specific Facebook page and extracts some data. I am using Node.js for this, with the built-in https module to make the GET request. On my local machine it works fine, but inside the VM it never gets a response; it fails every time.
I tried axios as well, but that does not work either.
At first I thought it could be a firewall issue, but I enabled everything in the firewall and it still does not work. It is a very strange issue.
What could be the possible reason for this?
Is Facebook blocking my request? Surely not, since I am sending a User-Agent header, yet it is still not working.
My code looks like this:
// Note: this snippet presumably runs inside a Promise executor; resolve, reject,
// rePattern, and username are defined in the enclosing scope.
const options = {
    hostname: 'www.facebook.com',
    // port: 443,
    path: `/${username}`,
    method: 'GET',
    headers: {
        'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Accept-Language': 'en-GB,en-US;q=0.8,en;q=0.6',
    },
};
const request = https.request(options, (response) => {
    let body = '';
    console.log('response : \n' + response);
    response.setEncoding('utf8');
    response.on('data', (chunk) => {
        body += chunk;
        console.log('response on data chunk : \n' + chunk);
    });
    response.on('end', () => {
        console.log('body : \n' + body);
        const arrMatches = body.match(rePattern);
        if (arrMatches && arrMatches.length > 1) {
            resolve(arrMatches[1]);
        } else {
            reject(new Error('Facebook user not found'));
        }
    });
});
request.on('error', (err) => {
    console.log('request error : \n' + err);
    reject(err);
});
request.end();
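For reference, a minimal self-contained sketch of the excerpt above, wrapped in the Promise it presumably lives in. The username and rePattern values here are hypothetical placeholders:
const https = require('https');

// Hypothetical placeholders; the real values come from the surrounding code.
const username = 'someuser';
const rePattern = /<title[^>]*>([^<]*)<\/title>/;

new Promise((resolve, reject) => {
    const request = https.request({
        hostname: 'www.facebook.com',
        path: `/${username}`,
        method: 'GET',
        headers: { 'user-agent': 'Mozilla/5.0' },
    }, (response) => {
        let body = '';
        response.setEncoding('utf8');
        response.on('data', (chunk) => { body += chunk; });
        response.on('end', () => {
            const arrMatches = body.match(rePattern);
            if (arrMatches && arrMatches.length > 1) {
                resolve(arrMatches[1]);
            } else {
                reject(new Error('Facebook user not found'));
            }
        });
    });
    request.on('error', reject);
    request.end();
}).then(console.log, console.error);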

request nodejs gets unreadable data

I'm trying to scrape the HTML using the request library on node.js. The response code is 200, but the data I get is unreadable. Here is my code:
var request = require("request");
const options = {
uri: 'https://www.wikipedia.org',
encoding: 'utf-8',
headers: {
"Accept": "text/html,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"charset": "utf-8",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/78.0.3904.108 Chrome/78.0.3904.108 Safari/537.36"
}
};
request(options, function(error, response, body) {
console.log(body);
});
As you can see, I requested HTML in utf-8, but I got a large string like f��j���+���x��,�G�Y�l
My node version is v8.10.0 and the request version is 2.88.0.
Is something wrong with the code, or am I missing something?
Any hint to overcome this problem would be appreciated.
Updated Answer:
In response to your latest post:
The reason it is not working for Amazon is that the response is gzipped. In order to decompress the gzip response, you simply need to add gzip: true to the options object you are using. This will work for both Amazon and Wikipedia:
const request = require('request');

const options = {
    uri: "https://www.amazon.com",
    gzip: true
};

request(options, function (error, response, body) {
    if (error) throw error;
    console.log(body);
});
Lastly, if you want to scrape webpages like this, it is probably best to use a web scraping framework like Puppeteer, since it is built for web scraping.
See the Puppeteer GitHub repository for details.
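A minimal Puppeteer sketch of the same fetch, assuming puppeteer has been installed from npm (it launches a real headless browser, so decompression and rendering are handled for you):
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://www.amazon.com');
    const html = await page.content(); // fully rendered HTML
    console.log(html);
    await browser.close();
})();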
Original Answer:
Since you are just grabbing the HTML from the main page, you do not have to specify charset, encoding, or Accept-Encoding:
const request = require('request');

const options = {
    uri: 'https://www.wikipedia.org',
    //encoding: 'utf-8',
    headers: {
        "Accept": "text/html,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        //"charset": "utf-8",
        //"Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/78.0.3904.108 Chrome/78.0.3904.108 Safari/537.36"
    }
};

request(options, function (error, response, body) {
    if (error) throw error;
    console.log(body);
});
To take it a bit further... in this scenario, you don't need to specify headers at all...
const request = require('request');

request('https://www.wikipedia.org', function (error, response, body) {
    if (error) throw error;
    console.log(body);
});
Thank you for the reply. When I use that on the Wikipedia page it works properly, but when I use it to scrape another website, like Amazon, I get the same bad result:
const request = require('request');

request('https://www.amazon.com', function (error, response, body) {
    if (error) throw error;
    console.log(body);
});

request callback, can't access error, response and body

I'm creating an https request to get some hidden variables on a sign-in page. I'm using the node.js request package for this. After making the request, I'm using a callback to go back to my parse function.
class h {
    constructor(username, password) {
        this.username = username;
        this.password = password;
        this.secret12 = '';
    }
    init() {
        // Loading H without cookie
        request({
            uri: "http://example.com",
            method: "GET",
            jar: jar,
            followRedirect: true,
            maxRedirects: 10,
            timeout: 10000,
            // Need to fake the user agent, otherwise the server will close the connection without a response.
            headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36' }
        },
        this.getHiddenInputs());
    }
    getHiddenInputs(error, response, body) {
        if (!error && response.statusCode === 200) {
            // Parsing the body of the request to get the hidden inputs required to mock legit authentication.
            const dom = new JSDOM(body);
            this.secret12 = dom.window.document.querySelector('input[value][type="hidden"][name="secret12"]').value;
        } else {
            console.log(error);
            console.log(response.statusCode);
        }
    }
}
const helper = new h("Username", "Password");
helper.init();
console.log(helper);
So after calling request inside init(), I'm using the callback function to run the code that finds the hidden inputs once the request has completed. I'm following the example from here.
Am I missing something?
You are executing this.getHiddenInputs() instead of passing it to request as a callback, so no actual callback is given to the request call.
You could pass it like this: this.getHiddenInputs.bind(this). Or, as I'd prefer, something like this: (error, response, body) => this.getHiddenInputs(error, response, body).
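Applied to the init() above, a sketch of the arrow-function variant (only the callback argument changes):
init() {
    request({
        uri: "http://example.com",
        method: "GET",
        jar: jar,
        followRedirect: true,
        maxRedirects: 10,
        timeout: 10000,
        // same User-Agent header as above
        headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36' }
    },
    // Pass a function instead of calling it; the arrow keeps `this` bound to the instance.
    (error, response, body) => this.getHiddenInputs(error, response, body));
}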

Scrape and store Shopify ecommerce websites using Node.js

I wrote code to scrape an array of Shopify ecommerce websites using the website-scraper npm module in Node.js, but it returns a 403 error, even though the same code works for other websites.
How can I get around this problem?
My scraperTest.js file is:
var scrape = require('website-scraper');
let test = require('./test');
let urls = [];
var urlList = ['1500.academy'];

urlList.forEach(url => {
    test.checkRedirect(url)
        .then(domain => {
            urls.push('https://' + domain);
            console.log(urls);
            var options = {
                urls: urls,
                directory: './autochat/',
                'User-Agent': 'request',
            };
            // with promise
            scrape(options).then((result) => {
                /* some code here */
            }).catch((err) => {
                /* some code here */
            });
            // or with callback
            scrape(options, (error, result) => {
                /* some code here */
            });
        });
});
and my test.js file is:
const request = require('request');
const extractDomain = require('extract-domain');
//var link = 'oneplustwocase.com';

function checkRedirect(link) {
    return new Promise((resolve, reject) => {
        var url = "http://" + link;
        var options = {
            url: url,
            headers: {
                'User-Agent': 'request'
            }
        };
        request(options, function (error, response, body) {
            let redirectedDomain = extractDomain(response.request.uri.href);
            if (response !== undefined) {
                extractDomain(response.request.uri.href);
                if (response.statusCode === 200 && link !== redirectedDomain) {
                    resolve(redirectedDomain);
                } else {
                    resolve(link);
                }
            } else {
                resolve(link);
            }
        });
    });
}
module.exports.checkRedirect = checkRedirect;
I got the solution.
We are able to fetch the HTML data of the domain using request(); response.body contains the HTML data.
I got the solution using the following code:
const request = require('request');
const extractDomain = require('extract-domain');
let fs = require('fs');

function checkRedirect(link) {
    var url = "http://" + link;
    var options = {
        url: url,
        headers: {
            'User-Agent': 'request'
        }
    };
    request(options, function (error, response, body) {
        if (response !== undefined) {
            let redirectedDomain = extractDomain(response.request.uri.href);
            // Save the HTML of the (possibly redirected) page to <domain>.html
            let writeStream = fs.createWriteStream(redirectedDomain + '.html');
            writeStream.write(response.body);
            writeStream.end();
        }
    });
}
module.exports.checkRedirect = checkRedirect;
//checkRedirect('oneplustwocase.com')
Since you are interested in the data, save yourself the headache of scraping and simply download the site's sitemap XML file. It contains all the products and other interesting information, just as Google or any other search engine sees it.
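For example, a sketch that fetches a store's sitemap directly with request, assuming the site exposes the standard /sitemap.xml (Shopify stores do):
const request = require('request');

request({
    url: 'https://1500.academy/sitemap.xml',
    headers: { 'User-Agent': 'Mozilla/5.0' } // the site 403s unrecognized user agents
}, function (error, response, body) {
    if (error) return console.error(error);
    console.log(body); // XML listing the store's pages and products
});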
It seems that the website http://1500.academy returns 403 if it doesn't like the user-agent header. I suggest trying a user-agent that looks like a browser's.
According to the website-scraper documentation https://www.npmjs.com/package/website-scraper#request you should pass headers for the request in the request property, not on the root level.
So the options should look like:
const options = {
    urls: [{ url: 'http://1500.academy/' }],
    directory: './autochat/',
    request: {
        headers: {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        }
    }
};
By the way, website-scraper follows redirects by default, so you can skip the redirect check entirely.
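A sketch of the complete call with these corrected options, reusing the promise form from the question:
const scrape = require('website-scraper');

scrape(options).then((result) => {
    console.log('Scraped', result.length, 'site(s) into ./autochat/');
}).catch((err) => {
    console.error(err);
});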

How to do a get request and get the same results that a browser would with Nodejs?

I'm trying to do a GET request for an image search, and I'm not getting the same result that I get in my browser. Is there a way to get the same result using node.js?
Here's the code I'm using:
var keyword = "Photographie"
keyword = keyword.replace(/[^a-zA-Z0-9éàèùâêîôûçëïü]/g, "+")
var httpOptions = { hostname: 'yandex.com',
path: '/images/search?text=' + keyword, //path does not accept spaces or dashes
headers: { 'Content-Type': 'application/x-www-form-urlencoded', 'user-agent': 'Mozilla/5.0'}}
console.log(httpOptions.hostname + httpOptions.path +postTitle)
https.get(httpOptions, (httpResponse) => {
console.log(`STATUS: ${httpResponse.statusCode}`);
httpResponse.setEncoding('utf8');
httpResponse.on('data', (htmlBody) => {
console.log(`BODY: ${htmlBody}`);
});
});
By switching to the request-promise library and using the proper capitalization of the User-Agent header name and an actual user agent string from the Chrome browser, this code works for me:
const rp = require('request-promise');

let keyword = "Photographie";
let options = {
    url: 'http://yandex.com/images/search?text=' + keyword,
    headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36' }
};

rp(options).then(response => {
    console.log(response);
}).catch(err => {
    console.log(err);
});
When I try to run your actual code, I get a 302 redirect and a cookie set. I'm guessing that they expect you to follow the redirect and retain the cookie. But you can apparently just switch to the code above, and it appears to work for me. I don't know exactly what makes my code work, but it could be that it has a more recognizable user agent.
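If you would rather stay with plain request, here is a hedged sketch of following the redirect while retaining the cookie; jar: true keeps cookies across requests, though whether Yandex then serves the full page is untested:
const request = require('request');

request({
    url: 'http://yandex.com/images/search?text=Photographie',
    jar: true,            // keep the cookie the server sets on the 302
    followRedirect: true, // request follows GET redirects by default
    headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36' }
}, function (error, response, body) {
    if (error) return console.error(error);
    console.log(response.statusCode, body.length);
});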
