Node.js: How to download a file via an HTTPS/POST request - node.js

I'm trying to download a file from www.borsaistanbul.com
For some file (like the ones under the link=> https://www.borsaistanbul.com/veriler/verileralt/hisse-senetleri-piyasasi-verileri/bulten-verileri ) they've provided the file paths so I was able to download them via https.get(downloadLink) easily.
But for the files under https://www.borsaistanbul.com/veriler/verileralt/hisse-senetleri-piyasasi-verileri/piyasa-verileri they don't provide the paths and the download links.
I'm trying to download the one named "Üye Bazında Seanslık İşlem Sıralaması"(the one on the 2nd row)
I might be wrong but as far as I understand, when you click on the download image next to it, your browser makes a POST request and then it triggers smth on the server side and then server serves the file to you.
I've found the POST request with the help of chromeDeveloper tool and tried to simulate it but it does not seem to work.
Could anyone help and show me how to download this file?
Here is a sample code I've tried:
// BUG FIX: `fs` was assigned without `const`, creating an implicit global.
const fs = require('fs');
const request = require('request');

/* Destination file for the downloaded archive. */
const file = fs.createWriteStream(`denemePost.zip`);

/* Using Promises so that we can use the ASYNC AWAIT syntax */
new Promise((resolve, reject) => {
  request
    .post({
      /* Here you should specify the exact link to the file you are trying to download */
      uri: 'https://www.borsaistanbul.com/veriler/verileralt/hisse-senetleri-piyasasi-verileri/bulten-verileri',
      headers: {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        // BUG FIX: the original sent a hard-coded 'Content-Length: 7511' while
        // posting NO body at all. A Content-Length that does not match the
        // actual payload is rejected or ignored by the server. `request`
        // computes the correct length automatically from `form`, so the
        // header must not be set by hand.
        'Content-Type': 'application/x-www-form-urlencoded',
        // NOTE(review): this is a captured ASP.NET session cookie; it expires
        // quickly, which alone can make the replayed request fail — refresh it
        // (or fetch the page first) before posting.
        'Cookie': 'ASP.NET_SessionId=vugebk1zob2fw2hgxiftjg1z; cPER=!SmE/fvI1sjF1DqtSzYfA84hhMFmKdR+VmPTaX1WlhB8KHfkS3iP2fO2FK2iyUzwiDyupy85iZItfoeo=; _ga=GA1.2.534681471.1587587675; _gid=GA1.2.113108587.1588205109',
        'Host': 'www.borsaistanbul.com',
        'Origin': 'null',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'
      },
      // TODO(review): the browser's POST carries the ASP.NET form fields
      // (__VIEWSTATE, __EVENTVALIDATION, __EVENTTARGET, ...) — presumably the
      // server needs them to decide which file to serve. Scrape them from the
      // page and pass them here, e.g.:
      // form: { __VIEWSTATE: '...', __EVENTTARGET: '...' },
      /* GZIP true for most of the websites now, disable it if you don't need it */
      gzip: true
    })
    // BUG FIX: errors emitted by the request stream are NOT forwarded through
    // pipe(); without this handler a network failure crashes the process and
    // the Promise never settles.
    .on('error', reject)
    .pipe(file)
    .on('finish', () => {
      console.log(`The file is finished downloading.`);
      resolve();
    })
    // Disk-write errors on the destination stream.
    .on('error', (error) => {
      reject(error);
    });
})
.catch(error => {
  console.log(`Something happened: ${error}`);
});
Any help would be much appreciated,
Thanks in advance

I found a workaround if anyone tries to accomplish a similar thing.
I've downloaded the file with puppeteer libraries.
const puppeteer = require('puppeteer');

/*
 * Downloads the "Üye Bazında Seanslık İşlem Sıralaması" file by driving a
 * real browser: navigate to the page, wait for the download link-button to
 * render, and click it so Chromium performs the ASP.NET postback itself.
 */
(async () => {
  const browser = await puppeteer.launch({ headless: false, slowMo: 250 });
  const page = await browser.newPage();
  await page.goto('https://www.borsaistanbul.com/veriler/verileralt/hisse-senetleri-piyasasi-verileri/piyasa-verileri');
  page.once('load', () => console.log('Page loaded!'));
  await page.waitForSelector('#TextContent_C001_lbtnUyeBazindaGunlukIslemSiralamasi');
  await page.click('#TextContent_C001_lbtnUyeBazindaGunlukIslemSiralamasi');
  // BUG FIX: click() resolves as soon as the click is dispatched, not when the
  // download finishes. Closing the browser immediately aborts the in-flight
  // download, so give Chromium time to write the file first.
  // TODO(review): replace the fixed delay with a watch on the download
  // directory for a more robust completion signal.
  await new Promise((resolveDelay) => setTimeout(resolveDelay, 10000));
  await browser.close();
})();

Related

twitter.com Fetch NodeJS 404 Reply

I'm always getting a 404 reply when querying twitter.com (not the API, the homepage), even though I've used the exact same query that Edge uses.
const http2 = require('node:http2');
const fs = require('node:fs'); // NOTE(review): required but unused in this snippet

/*
 * Fetches the twitter.com homepage over a raw HTTP/2 session, printing the
 * response headers and buffering the body.
 */
const client = http2.connect('https://twitter.com', {});
client.on('error', (err) => console.error(err));

const req = client.request({
  ':path': '/',
  ':method': 'GET',
  ':authority': `twitter.com`,
  accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  // BUG FIX: the original advertised 'gzip, deflate, br' but then read the
  // body with setEncoding('utf8') and no decompression step — a compressed
  // reply arrives as binary garbage. Node's http2 does NOT decompress for
  // you, so only advertise identity (or pipe through zlib yourself).
  'accept-encoding': 'identity',
  'accept-language': 'en-US',
  'cache-control': 'no-cache',
  dnt: '1',
  pragma: 'no-cache',
  referer: 'https://twitter.com/',
  "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"101\", \"Microsoft Edge\";v=\"101\"",
  "sec-ch-ua-mobile": "?0",
  "sec-ch-ua-platform": "\"Windows\"",
  "sec-fetch-dest": "document",
  "sec-fetch-mode": "navigate",
  "sec-fetch-site": "same-origin",
  "sec-fetch-user": "?1",
  "sec-gpc": "1",
  "upgrade-insecure-requests": "1",
  "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32"
});

// Dump every response (and pseudo-) header, including :status.
req.on('response', (headers, flags) => {
  for (const name in headers) {
    console.log(`${name}: ${headers[name]}`);
  }
});

// BUG FIX: without an 'error' handler a stream error is an uncaught
// exception and the session is never closed.
req.on('error', (err) => {
  console.error(err);
  client.close();
});

req.setEncoding('utf8');
let data = '';
req.on('data', (chunk) => { data += chunk; });
req.on('end', () => {
  //console.log(`\n${data}`);
  client.close();
});
req.end();
// NOTE(review): the 404 itself is likely server-side bot detection (the
// browser sends cookies and executes JS before the page 200s) — TODO confirm
// by replaying with the browser's cookies attached.
Sorry to ask but I'm stuck on this for hours and I simply want to retrieve the homepage of twitter.com (or other websites) but I keep getting this 404 while the browser returns 200.
If I do a simple wget https://twitter.com then it's all good, no 404 message, so it has to do with NodeJS way I'm doing I guess, I do not wish to use a NPM package but simply understand what is the problem here.
Thank you

Web scraping using fetch - promise doesn't resolve

I am trying to fetch a particular website, and I already mimic all the request headers that Chrome sends and I am still getting a pending promise that never resolves.
Here is my current code and headers:
const fetch = require('node-fetch');

/*
 * Fetches the G2A product page with browser-like headers and reports whether
 * the response was OK.
 */
(async () => {
  console.log('Starting fetch');
  try {
    const fetchResponse = await fetch('https://www.g2a.com/rocket-league-pc-steam-key-global-i10000003107015', {
      method: 'GET',
      headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Accept-Language': 'en-US;q=0.7,en;q=0.3',
        // BUG FIX: the original advertised 'gzip, deflate, br', but node-fetch
        // 2.x cannot decode Brotli — a 'content-encoding: br' response leaves
        // the body stream undecodable, which is the classic cause of the
        // "promise never resolves" symptom described above.
        'Accept-Encoding': 'gzip, deflate'
      }
    });
    console.log('I never see this console.log: ', fetchResponse);
    if (fetchResponse.ok) {
      console.log('ok');
    } else {
      console.log('not ok');
    }
  } catch (error) {
    // BUG FIX: an await with no try/catch inside an async IIFE produces an
    // unhandled rejection on any network failure.
    console.error('Fetch failed:', error);
  }
  console.log('Leaving...');
})();
This is the console logs I can read:
Starting fetch
This is a pending promise: Promise { <pending> }
not ok
Leaving...
Is there something I can do here? I notice on similar questions that for this specific website, I only need to use Accept-Language header, I already tried that, but still the promise never gets resolved.
Also read on another question that they have security against Node.js requests, maybe I need to use another language?
You'll have a better time using async functions and await instead of then here.
I'm assuming your Node.js doesn't support top-level await, hence the last .then.
const fetch = require("node-fetch");

// Browser-like headers sent with every request.
const headers = {
  "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
  "Accept-Language": "en-US;q=0.7,en;q=0.3",
  // BUG FIX: do not advertise "br" — node-fetch 2.x cannot decode Brotli
  // responses, which makes reading the body hang.
  "Accept-Encoding": "gzip, deflate",
};

/**
 * Fetches `url` with the shared headers and returns the parsed body.
 * Throws on a non-2xx status.
 * NOTE(review): the target URL serves an HTML product page, so `.json()`
 * will presumably throw here — switch to `.text()` if HTML is expected.
 */
async function doFetch(url) {
  console.log("Starting fetch");
  const fetchResponse = await fetch(url, {
    method: "GET",
    headers,
  });
  console.log(fetchResponse);
  if (!fetchResponse.ok) {
    throw new Error("Response not OK");
  }
  const data = await fetchResponse.json();
  return data;
}

doFetch("https://www.g2a.com/rocket-league-pc-steam-key-global-i10000003107015")
  .then((data) => {
    console.log("All done", data);
  })
  // BUG FIX: the original chain had no .catch(), so any failure became an
  // unhandled promise rejection.
  .catch((err) => {
    console.error("doFetch failed:", err);
  });

How do I set multiple custom HTTP headers in puppeteer?

I am trying to login using puppeteer at https://kith.com/account/login?return_url=%2Faccount
When I login and solve the captcha with audio, it detects me as a bot, so I am trying to change the request headers to see if that helps but cannot find anything on how to change them.
I found this, but it only shows 1 header:
await page.setRequestInterception(true)
page.on('request', (interceptedRequest) => {
  // Copy the intercepted request's headers and append the custom one,
  // then let the request proceed with the patched set.
  const patchedHeaders = {
    ...interceptedRequest.headers(),
    'X-Just-Must-Be-Request-In-All-Requests': '1',
  };
  interceptedRequest.continue({ headers: patchedHeaders });
});
You are able to set multiple HTTP headers with the dedicated puppeteer method: page.setExtraHTTPHeaders as well.
E.g.:
// Fixed set of extra headers applied to every request the page makes.
const extraHeaders = {
  'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
  'upgrade-insecure-requests': '1',
  'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
  'accept-encoding': 'gzip, deflate, br',
  'accept-language': 'en-US,en;q=0.9,en;q=0.8',
};
await page.setExtraHTTPHeaders(extraHeaders);
await page.goto('...')
`headers` is a plain object, so you can add as many header entries as you want:
// BUG FIX: interception must be switched on before request.continue() may be
// called — without this line every handler invocation throws
// "Request Interception is not enabled".
await page.setRequestInterception(true);
page.on('request', (request) => {
  // headers() returns a plain object, so any number of entries can be added.
  const headers = request.headers();
  headers['X-Just-Must-Be-Request-In-All-Requests'] = '1';
  headers['foo'] = 'bar';
  headers['foo2'] = 'bar2';
  request.continue({
    headers
  });
});

NodeJs request.get() function not working while the url is accessible from the browser

I am using the request npm module. I want to retrieve an image from a url. The request.get(url) function is returning me a '400 Bad Request', whereas the image is accessible from the browser.
The url i am hitting is : http://indiatribune.com/wp-content/uploads/2017/09/health.jpg
You could try to add some headers:
const request = require('request');

/* Fetch the image with browser-like headers so the server does not 400. */
request.get({
  url: 'http://indiatribune.com/wp-content/uploads/2017/09/health.jpg',
  // BUG FIX: request decodes the body as a UTF-8 string by default, which
  // corrupts binary image data; null makes `data` a Buffer.
  encoding: null,
  headers: {
    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-GB,en;q=0.8,en-US;q=0.6,hu;q=0.4',
    'Cache-Control': 'max-age=0',
    Connection: 'keep-alive',
    Host: 'indiatribune.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
  },
}, (err, response, data) => {
  // BUG FIX: the original callback ignored `err` entirely, so a network
  // failure passed undefined response/data straight to console.log.
  if (err) {
    console.error('Request failed:', err);
    return;
  }
  console.log(response, data);
});
The User-Agent seems to be enough.
Use download module . It's pretty simple.
const fs = require('fs');
const download = require('download');

/* Stream the remote image straight into a local file. */
download('http://indiatribune.com/wp-content/uploads/2017/09/health.jpg')
  // BUG FIX: an unhandled 'error' event on either stream crashes the
  // process; pipe() does not forward errors from source to destination.
  .on('error', (err) => console.error('Download failed:', err))
  .pipe(fs.createWriteStream('foo.jpg'))
  .on('error', (err) => console.error('Write failed:', err));

How to do a get request and get the same results that a browser would with Nodejs?

I'm trying to do a get request for image search, and I'm not getting the same result that I am in my browser. Is there a way to get the same result using node.js?
Here's the code I'm using:
// BUG FIX: `https` was used below but never required in the snippet.
const https = require('https');

let keyword = "Photographie"
// Replace every character that is not alphanumeric or an accented letter
// with '+' so the value is safe inside the query string.
keyword = keyword.replace(/[^a-zA-Z0-9éàèùâêîôûçëïü]/g, "+")

const httpOptions = {
  hostname: 'yandex.com',
  path: '/images/search?text=' + keyword, //path does not accept spaces or dashes
  headers: { 'Content-Type': 'application/x-www-form-urlencoded', 'user-agent': 'Mozilla/5.0' }
}

// BUG FIX: the original logged `... + postTitle`, but `postTitle` is never
// defined anywhere in the snippet — a ReferenceError before the request runs.
console.log(httpOptions.hostname + httpOptions.path)

https.get(httpOptions, (httpResponse) => {
  console.log(`STATUS: ${httpResponse.statusCode}`);
  httpResponse.setEncoding('utf8');
  httpResponse.on('data', (htmlBody) => {
    console.log(`BODY: ${htmlBody}`);
  });
// BUG FIX: an unhandled 'error' event on the request crashes the process.
}).on('error', (err) => console.error('Request failed:', err));
By switching to the request-promise library and using the proper capitalization of the User-Agent header name and an actual user agent string from the Chrome browser, this code works for me:
const rp = require('request-promise');

// Search term to query Yandex image search with.
const searchTerm = "Photographie"

// Request options: target URL plus a real Chrome User-Agent string, which is
// what lets the server treat this like a browser request.
const requestOptions = {
  url: 'http://yandex.com/images/search?text=' + searchTerm,
  headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36' }
};

// Fire the request; log the body on success, the error otherwise.
(async () => {
  try {
    const response = await rp(requestOptions);
    console.log(response);
  } catch (err) {
    console.log(err);
  }
})();
When I try to run your actual code, I get a 302 redirect and a cookie set. I'm guessing that they are expecting you to follow the redirect and retain the cookie. But, you can apparently just switch to the above code and it appears to work for me. I don't know exactly what makes my code work, but it could be that it has a more recognizable user agent.

Resources