I am trying to scrape today's matches on Betfair and want to get:
home team
away team
x odds
draw odds
y odds
The problem is that I keep getting multiple blank lines in the output. I have tried a lot of things and cannot fix it; the problem is not with trimming but with the execution flow, which produces the empty lines.
Can somebody tell me what I'm doing wrong?
My code:
const request = require('request');
const cheerio = require('cheerio');
const fs = require('fs');

var url = 'https://www.betfair.com/sport/football';

var customHeaderRequest = request.defaults({
    headers: {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}
});

customHeaderRequest.get(url, function(err, resp, body){
    $ = cheerio.load(body);
    links = $('.section-list .section:nth-child(2) .event-list li');
    $(links).each(function(i, link){
        var home = $(link).find('.event-information div:nth-child(3) div a div span.team-name:nth-child(1)');
        var h = home.text();
        if(h != null || h != ''){
            fs.appendFile('message.txt', h + '\n', function (err) {});
        }
    });
});
You shouldn't be calling fs.appendFile() in a loop like this, and you need a better test for an empty line than the one you were using: h != null || h != '' is always true (you presumably meant &&, and even that wouldn't catch whitespace-only strings). fs.appendFile() is an asynchronous operation, so you're essentially starting a whole bunch of fs.appendFile() operations one after another without waiting for the prior ones to finish.
You can either use a stream, or you have to wait until the previous fs.appendFile() is done before calling the next one.
And, if you want to make sure you have no blank-looking results, you need a better filter for results that contain only whitespace (I added .trim() to my code below).
Here's one way to do that:
const request = require('request');
const cheerio = require('cheerio');
const fs = require('fs');
const util = require('util');
const appendFile = util.promisify(fs.appendFile);

var url = 'https://www.betfair.com/sport/football';

var customHeaderRequest = request.defaults({
    headers: {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}
});

customHeaderRequest.get(url, async function(err, resp, body){
    try {
        let $ = cheerio.load(body);
        let links = $('.section-list .section:nth-child(2) .event-list li').toArray();
        for (let link of links) {
            const home = $(link).find('.event-information div:nth-child(3) div a div span.team-name:nth-child(1)').text().trim();
            if (home) {
                await appendFile('message.txt', home + '\n');
            }
        }
    } catch(e) {
        // error writing to the file, handle that error here
    }
});
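And here's a sketch of the stream alternative mentioned above (same selectors as before, assumed unchanged): a single write stream queues its writes internally in order, so there is nothing to await per line.

// Sketch: append via one write stream instead of repeated fs.appendFile() calls.
const out = fs.createWriteStream('message.txt', { flags: 'a' });

customHeaderRequest.get(url, function(err, resp, body){
    const $ = cheerio.load(body);
    $('.section-list .section:nth-child(2) .event-list li').each(function(i, link){
        const home = $(link).find('.event-information div:nth-child(3) div a div span.team-name:nth-child(1)').text().trim();
        if (home) {
            out.write(home + '\n'); // writes are buffered and flushed in order
        }
    });
    out.end(); // close the stream when done
});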
Other notes: You should also always declare all local variables you are using so they are never allowed to be implicit globals.
I have a SOAP endpoint I need to gather data from, and I have a working prototype using Python 3; however, I would like this to be a Node.js project to diversify my portfolio. When sending headers to the endpoint I get an error (Media Type not supported), so I looked at the headers and noticed something odd: some of the keys are in quotes and others are not, and I believe this may be the source of the issue. Any help would be appreciated.
Headers when the request is made:
{
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
    'Content-Type': 'text/xml; charset=utf-8',
    SOAPAction: 'http://api.neatoscan.com/RequestOrdersReport',
    Host: 'desktopbridge.neatoscan.com',
    'Content-Length': '486',
    Expect: '100-continue',
    Connection: 'Keep-Alive'
}
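(As an aside: the mixed quoting above is just how console.log prints object keys. Keys that are valid JavaScript identifiers, like Host, are printed without quotes, while keys containing characters such as -, like 'Content-Type', are quoted. Both forms are identical at runtime, so the quoting itself is unlikely to be the source of the error. A quick check:)

// Quoted and unquoted keys are the same at runtime:
const sample = { SOAPAction: 'a', 'Content-Type': 'b' };
console.log(sample);                                     // { SOAPAction: 'a', 'Content-Type': 'b' }
console.log(sample['SOAPAction'] === sample.SOAPAction); // true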
Request code (all vars in the headers are pulled from a config file to keep authentication separate from the main file):
import {
    Client_User_Name, Token, start_date_time, end_date_time,
    SOAP_Header_CONTENT_TYPE, SOAP_Header_Host_Config, USER_AGENT,
    SOAP_actions, SOAP_Content_len, SOAP_urls
} from './config.js';
import { createRequire } from 'module';
const require = createRequire(import.meta.url);
const axios = require('axios');
const soapRequest = require('easy-soap-request');
var fs = require('fs'),
    parseString = require('xml2js').parseString,
    xml2js = require('xml2js');

var Request_Book_Order_report_XML = '/home/malachi/Desktop/Projects/Daily_Reports_Goodwill/NeatoScan/xml_files/Request_Book_Order_Report.xml';
var Report_Status_Books_XML = '/home/malachi/Desktop/Projects/Daily_Reports_Goodwill/NeatoScan/xml_files/Request_Report_Status.xml';

const url = SOAP_urls['Books_Url'];

function req(xml, headers) {
    axios.post(url, xml, headers)
        .then(response => { console.log(response.data); })
        .catch(err => { console.log(err); });
}

var SA = 'Book_Report_Request';
var CL = 'Book_Report_Request_Content_Len';
var url_Headers = {
    'User-Agent': USER_AGENT,
    'Content-Type': SOAP_Header_CONTENT_TYPE['Books_Content_Type'],
    'SOAPAction': SOAP_actions[SA],
    'Host': SOAP_Header_Host_Config['Books_Host'],
    'Content-Length': SOAP_Content_len[CL],
    'Expect': '100-continue',
    'Connection': 'Keep-Alive'
};

fs.readFile(Request_Book_Order_report_XML, 'utf-8', function(err, data) {
    parseString(data, function(err, result) {
        var json = result;
        var auth_filter = json['soap:Envelope']['soap:Body'][0]['RequestOrdersReport'][0];
        var auth_Client_User = auth_filter['username'] = Client_User_Name;
        var auth_token = auth_filter['token'] = Token;
        var date_start_filter = auth_filter['startDate'] = start_date_time;
        var date_end_filter = auth_filter['endDate'] = end_date_time;
        var builder = new xml2js.Builder();
        var xml = builder.buildObject(json);
        console.log(xml);
        console.log(url_Headers);
        // req(xml, url_Headers)
    });
});
Simple code, should work, but it doesn't.
const puppeteer = require('puppeteer');

async function scrapeProduct(url) {
    const browser = await puppeteer.launch({ headless: false });
    const page = await browser.newPage();
    await page.setExtraHTTPHeaders({
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    });
    await page.goto(url);
    const [el] = await page.$x('/html/body/main/div[1]/div/div/div[2]/h1');
    const txt = await el.getProperty('txt');
    const srcText = await txt.jsonValue();
    console.log(srcText);
}

scrapeProduct('https://getbootstrap.com/');
// Same result on other URLs as well.
I've also tried querySelector instead of XPath; that worked in some cases, and it would log the first value of the node as expected, but then querySelectorAll on the same element would again return "undefined". I've looked everywhere but simply can't find the solution.
I do it this way:
const puppeteer = require("puppeteer");
async function scrapeProduct(url) {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.setExtraHTTPHeaders({
"user-agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
});
await page.goto(url);
// wait for elements defined by XPath appear in page
await page.waitForXPath("/html/body/main/div[1]/div/div/div[2]/h1");
// evaluate XPath expression of the target selector (it return array of ElementHandle)
const headings = await page.$x("/html/body/main/div[1]/div/div/div[2]/h1");
// prepare to get the textContent of the selector above (use page.evaluate)
let textContent = await page.evaluate((el) => el.textContent, headings[0]);
console.log(textContent);
}
scrapeProduct('https://getbootstrap.com/')
Upvote my answer if it helps!
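For reference, the reason the original snippet printed undefined is that the element handle has no 'txt' property; requesting 'textContent' instead also fixes it with a one-word change (a sketch against the original code):

// 'txt' is not a DOM property; 'textContent' is.
const txt = await el.getProperty('textContent');
const srcText = await txt.jsonValue();
console.log(srcText);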
headers["user-agent"] = fakeUa();
console.log(fakeUa())
let firstReq = true;
page.route('**/*', route => {
const request = route.request()
//console.log(request.url(), JSON.stringify(request.headers()));
if("x-j3popqvx-a" in request.headers()){
headers = request.headers();
//console.log(headers);
console.log("exiting");
return;
}
else {
console.log("in");
return route.continue({headers: headers});
}
});
let pageRes = await page.goto(url, {waitUntil: 'load', timeout: 0});
I want to add a fake user agent when sending a request to the URL, but it doesn't add the fake user agent; it goes with the default one instead.
While in Puppeteer it was possible to apply a custom UA with the page.setUserAgent() method and to set custom headers with page.setExtraHTTPHeaders(), in Playwright you set the custom user agent (userAgent) and headers (extraHTTPHeaders) as options of browser.newPage() or browser.newContext(), like:
const page = await browser.newPage({ userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36' })

const page = await browser.newPage({
    extraHTTPHeaders: {
        'Cache-Control': 'no-cache'
    }
})
Edit: in case you are using it with newContext(), usage looks like this (make sure to set userAgent in the settings of newContext and not in newPage!):
const context = await browser.newContext({ userAgent: 'hello' })
const page = await context.newPage()
// to check the UA:
console.log(await page.evaluate(() => navigator.userAgent))
If you're using @playwright/test, you can set a user agent as follows:
import {expect, test} from "@playwright/test"; // ^1.30.0

const userAgent =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36";

test.describe("with user agent", () => {
    test.use({userAgent});

    test("does stuff", async ({page}) => {
        await page.goto("https://example.com/");
        await expect(page.locator("h1")).toHaveText("Example Domain");
    });
});
I'm trying to do a GET request for an image search, and I'm not getting the same result that I get in my browser. Is there a way to get the same result using Node.js?
Here's the code I'm using:
const https = require('https');

var keyword = "Photographie";
keyword = keyword.replace(/[^a-zA-Z0-9éàèùâêîôûçëïü]/g, "+");

var httpOptions = {
    hostname: 'yandex.com',
    path: '/images/search?text=' + keyword, // path does not accept spaces or dashes
    headers: { 'Content-Type': 'application/x-www-form-urlencoded', 'user-agent': 'Mozilla/5.0' }
};
console.log(httpOptions.hostname + httpOptions.path + postTitle);

https.get(httpOptions, (httpResponse) => {
    console.log(`STATUS: ${httpResponse.statusCode}`);
    httpResponse.setEncoding('utf8');
    httpResponse.on('data', (htmlBody) => {
        console.log(`BODY: ${htmlBody}`);
    });
});
By switching to the request-promise library, using the proper capitalization of the User-Agent header name, and using an actual Chrome user agent string, this code works for me:
const rp = require('request-promise');

let keyword = "Photographie";
let options = {
    url: 'http://yandex.com/images/search?text=' + keyword,
    headers: {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
};

rp(options).then(response => {
    console.log(response);
}).catch(err => {
    console.log(err);
});
When I try to run your actual code, I get a 302 redirect and a cookie set. I'm guessing they expect you to follow the redirect and retain the cookie. But you can apparently just switch to the above code and it appears to work. I don't know exactly what makes my code work, but it could be that it has a more recognizable user agent.
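If the redirect-and-cookie handling does turn out to matter, request-promise (via the underlying request library) can retain cookies and follow redirects for you; a minimal sketch, assuming the same options object as above:

// jar: true keeps cookies across requests in this chain;
// followRedirect is already the default for GET, shown here only for clarity.
let options = {
    url: 'http://yandex.com/images/search?text=' + keyword,
    jar: true,
    followRedirect: true,
    headers: {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
};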
I'm trying to use the Unirest jar feature to make session-authenticated requests; however, I can't seem to get my name logged to the console. The following is the sample code I'm using. Any ideas on how I can get my name logged?
var cheerio = require('cheerio');
var unirest = require('unirest');

var CookieJar = unirest.jar();
var twitterLogin = 'https://twitter.com/sessions';
var twitterUsername = 'TWITTERUSERNAME';
var twitterPassword = 'TWITTERPASSWORD!';

var Request = unirest.get('https://twitter.com/login').jar(CookieJar);

Request.end(function(response) {
    var $ = cheerio.load(response.body);
    var authToken = $('input[name="authenticity_token"]').val();
    console.log(authToken);

    unirest.post(twitterLogin)
        .jar(CookieJar)
        .followRedirect(true)
        .header('Referer', 'https://twitter.com/login')
        .header('content-type', 'application/x-www-form-urlencoded')
        .header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36')
        .field('authenticity_token', authToken)
        .field('session[username_or_email]', twitterUsername)
        .field('session[password]', twitterPassword)
        .end(function(response) {
            console.log(response.statusCode);
            console.log(response.body);

            unirest.get('https://twitter.com')
                .jar(CookieJar)
                .followRedirect(true)
                .end(function(response) {
                    var $ = cheerio.load(response.body);
                    console.log($('.DashboardProfileCard-name').text());
                });
        });
});
From the docs:
Request.jar(Boolean) or Request.jar(Jar)
Sets jar, cookie container, on Request.options. When set to true it stores cookies for future usage.
You should call unirest.jar() with true as the argument:
var CookieJar = unirest.jar(true);
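Applied to the code above, only the jar creation line changes; every request that chains .jar(CookieJar) stays the same:

// Store cookies for future usage so the session survives across requests:
var CookieJar = unirest.jar(true);
var Request = unirest.get('https://twitter.com/login').jar(CookieJar);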