I am using cheerio and Node.js to scrape data from the Allegro website in order to build API endpoints that return CSV data. This data will be studied later on as part of a data science project:
https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605
To get the cars' information, I managed to scrape the links from the first page. Each link leads to an individual car listing with the full information about that car, and I need to scrape more data from each of those links. How do I do that?
And how do I make the JSON data come out as CSV instead?
here the code used :
const url =
"https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605";
//const writeStream = fs.createWriteStream("allegro.csv");
// Write Headers
//writeStream.write(`Price,Link \n`);
/**
 * Downloads the listing page at the module-level `url` and extracts one
 * record per element matching the `._9c44d_2H7Kt` selector.
 *
 * @returns {Promise<Array<{price: string, link: (string|undefined), year: string, make: string}>>}
 *   Resolves with the extracted records, in the order the elements were visited.
 */
function getCars() {
  // Builds the record for a single listing card.
  const toCar = ($, node) => ({
    // Runs of two or more whitespace characters are stripped from the price text.
    price: $(node).find("._9c44d_1zemI").text().replace(/\s\s+/g, ""),
    link: $(node).find("a").attr("href"),
    // Text of the element two siblings after the first <dd>.
    year: $(node).find("dd").first().next().next().text(),
    make: $(node).find("h2").text(),
  });

  return fetch(`${url}`)
    .then((response) => response.text())
    .then((body) => {
      const $ = cheerio.load(body);
      const cars = [];
      $("._9c44d_2H7Kt").each((index, node) => {
        cars.push(toCar($, node));
      });
      return cars;
    });
}
the code used for the nodejs endpoint :
// GET /scraping/:allegro — runs the scraper and answers with the scraped cars as JSON.
app.get("/scraping/:allegro", (req, res) => {
  // Route parameters live on `req.params` (plural); `req.param.allegro` never holds the value.
  scraper
    .getCars(req.params.allegro)
    .then((cars) => {
      res.json(cars);
    })
    .catch((err) => {
      // Without this handler a failed scrape would leave the request hanging.
      console.error(err);
      res.status(500).json({ error: "Failed to scrape cars" });
    });
});
The data to get from each link is the following : date added,model,phone number, city,vin
There is a convenient thing about these pages, it's that you can return the data in JSON instead of html by just setting the media type to application/json eg setting the Accept header.
For instance to get the list :
curl "https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605&order=dd" \
-H "Accept: application/json"
To get a specific item :
curl "https://allegro.pl/ogloszenie/mercedes-ml320-9341716141" -H "Accept: application/json"
So you don't need to use webscraping tools just parsing JSON. The pagination is done by adding a query param &p=PAGE_NUM which is convenient too
I've made a small example in Python that can be easily ported to JS. It requests the list of cars, then requests the first element:
import requests
import json
import pandas as pd
# Sending "Accept: application/json" makes the site answer with JSON instead of HTML.
JSON_HEADERS = {"Accept": "application/json"}

# Request the category listing.
r = requests.get(
    "https://allegro.pl/kategoria/samochody-osobowe-4029",
    headers=JSON_HEADERS,
    params={
        "bmatch": "baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605",
        "order": "dd",
    },
)

# One row per promoted offer: the fixed fields first, then one column per
# listing parameter (first value only).
data = []
for listing in r.json()["pagination bottom"]["collection"]["items"]["promoted"]:
    row = {
        "name": listing["name"],
        "url": listing["url"],
        "price": listing["sellingMode"]["advertisement"]["price"]["amount"],
    }
    for param in listing["parameters"]:
        row[param["name"]] = param["values"][0]
    data.append(row)

df = pd.DataFrame(data)
print(df)

print("get data for first element")
# Request the detail page of the first offer, again as JSON.
r = requests.get(data[0]["url"], headers=JSON_HEADERS)
item = r.json()
offer = item["summary"]["offer"]
item_data = {
    "phone": offer["contact"]["phones"][0]["number"],
    "delivery": offer["delivery"]["summary"][0]["value"]["text"],
    "startingAt": offer["publication"]["startingAt"],
    "endingAt": offer["publication"]["endingAt"],
}
# Add every parameter of the first parameter group, keyed by its name.
for param in offer["parametersGroups"]["groups"][0]["parameters"]:
    item_data[param["name"]] = param["values"][0]["valueLabel"]
print(item_data)
An implementation in nodejs using axios :
const axios = require("axios");
/**
 * Fetches the car listing as JSON, prints the flattened promoted offers, then
 * fetches and prints the details of the first offer.
 *
 * Note: at module scope this function's name shadows Node's global `process`.
 *
 * @returns {Promise<void>} Settles once both responses have been printed;
 *   rejects if a request fails or a response lacks the expected fields.
 */
async function process() {
  // axios serializes `params` into the query string; a `query` option is not
  // part of the axios request config, so the filters must go here.
  let response = await axios.get("https://allegro.pl/kategoria/samochody-osobowe-4029", {
    params: {
      bmatch: "baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605",
      order: "dd",
    },
    headers: { Accept: "application/json" },
    responseType: "json",
  });

  const promoted = response.data["pagination bottom"].collection.items.promoted;

  // One flat object per promoted offer: fixed fields plus one key per parameter.
  const list = promoted.map((offer) => {
    const item = {
      name: offer.name,
      url: offer.url,
      price: offer.sellingMode.advertisement.price.amount,
    };
    for (const param of offer.parameters) {
      item[param.name] = param.values[0];
    }
    return item;
  });
  console.log(list);

  // Nothing to look up when the page has no promoted offers.
  if (list.length === 0) {
    return;
  }

  console.log(`fetching : ${list[0].url}`);
  response = await axios.get(list[0].url, {
    headers: { Accept: "application/json" },
    responseType: "json",
  });

  const offer = response.data.summary.offer;
  const entry = {
    phone: offer.contact.phones[0].number,
    delivery: offer.delivery.summary[0].value.text,
    startingAt: offer.publication.startingAt,
    endingAt: offer.publication.endingAt,
  };
  // Add every parameter of the first parameter group, keyed by its name.
  for (const parameter of offer.parametersGroups.groups[0].parameters) {
    entry[parameter.name] = parameter.values[0].valueLabel;
  }
  console.log(entry);
}
process();
Related
I am currently attempting to include UTM tags in a webhook that I am sending from a wix form. However, upon sending the webhook, I am either receiving an empty response or receiving "google.co.il" source. I am expecting specifically Google Ads or email sources.
I have added the code that I am using in an attempt to make this function properly. Any assistance or guidance would be greatly appreciated..
/**
 * Form-submission handler: gathers the submitted values into an object keyed
 * by field id and POSTs that object as JSON to the Make webhook.
 *
 * @param {{fields: Array<{id: string, fieldValue: *}>}} event - Submission
 *   event; only `fields[].id` and `fields[].fieldValue` are read.
 */
export function wixForms1_wixFormSubmitted(event) {
  const payload = {};
  for (const { id, fieldValue } of event.fields) {
    payload[id] = fieldValue;
  }

  const requestOptions = {
    "method": "post",
    "headers": { "Content-Type": "application/json" },
    "body": JSON.stringify(payload),
  };

  // Parse the body of a successful response; turn any other status into a rejection.
  const parseOrReject = (httpResponse) =>
    httpResponse.ok
      ? httpResponse.json()
      : Promise.reject("Fetch did not succeed");

  fetch("https://hook.eu1.make.com/62tsltjotop6iwboufggguuxhhqrjaen", requestOptions)
    .then(parseOrReject)
    .then((json) => console.log(json))
    .catch((err) => console.log(err));
}
// Once the page is ready, copy values from the URL query string into page elements.
$w.onReady(() => {
  const rawQuery = wixLocation.query;
  if (rawQuery) {
    // The whole query object, serialized.
    $w('#refbox').value = JSON.stringify(rawQuery);
  }

  const queryParams = wixLocation.query;
  console.log(queryParams);
  // Each of these is undefined when the URL carries no parameter of that name.
  $w('#campaign').value = queryParams.campaign_name;
  $w('#source').value = queryParams.utm_source;
});
import wixWindow from 'wix-window';
import wixLocation from 'wix-location';
});
import wixWindow from 'wix-window';
import wixLocation from 'wix-location';
import {local, session, memory} from 'wix-storage';
// Referrer as reported by wix-window, read once when this code loads.
let referrer = wixWindow.referrer;
// Value that was stored under the "page" session key before this load replaced it.
let previousPageURL;

// Remember the URL stored by the previous load, then store the current one.
$w.onReady(() => {
  previousPageURL = session.getItem("page");
  session.setItem("page", wixLocation.url);
});

// Fill the page elements with the current URL and the referrer.
$w.onReady(() => {
  const currentUrl = wixLocation.url;
  if (currentUrl) {
    $w('#prvurl').value = JSON.stringify(currentUrl);
  }

  const pageReferrer = wixWindow.referrer;
  if (pageReferrer) {
    $w('#refbox').value = JSON.stringify(pageReferrer);
  }
});
An API request like this: const response = await this.publicGetMarkets (params); is giving me a response that contains a list of markets in the following format:
{
"markets": {
"LINK-USD": {
"market": "LINK-USD",
"status": "ONLINE"
},
...
}
As in the example here, my problem is that LINK-USD is changing for every market.
How do I fix my code so that I can access values such as market and status in my code?
I have written the following code snippet:
// Read the `markets` entry from the API response.
const market = this.safeValue (response, 'markets');
const result = [];
// NOTE(review): no `markets` binding exists in this scope (the value above is
// stored in `market`), so this condition cannot read the response data.
for (let i = 0; i < markets.length; i++) {
// NOTE(review): this `const markets` refers to itself in its own initializer.
const markets = this.safeString (markets, {}, {});
const market = this.safeString (markets, 'market');
const status = this.safeString (markets, 'status');
// Collect one { market, status } record per iteration.
result.push({
'market': market,
'status': status,
});
}
return result;
You can get an array of all the inner objects using Object.values(data.markets).
If you need to filter out unwanted properties that is a fairly simple mapping addition to this also
// Sample payload: the key under `markets` is different for every market.
const data = {
  markets: {
    "LINK-USD": { market: "LINK-USD", status: "ONLINE" },
    "LINK-EURO": { market: "LINK-EURO", status: "TBD" },
  },
};

// Object.values() drops the varying keys and returns just the inner objects.
const res = Object.values(data.markets);
console.log(res);
const responses = this.safeValue (response, 'markets');
const result = [];
for (let response of responses) {
const market = responses.markets["LINK-USD"].market,
status = responses.markets["LINK-USD"].status;
result.push({market, status});
}
return result;
I hope this is what you asked for.
I'm working with some API server that communicates by XML.
I need to send, let's say: 20 identical POST requests.
I'm writing this in Node JS.
Easy.
BUT - since I'm going to multiply the process, and I want to avoid flooding the server (and getting kicked), I need to break the sending loop IF the (XML) response contains a specific text (a success signal): <code>555</code>, or actually just '555' (the text is wrapped with other XML phrases).
I tried to break the loop based on the success signal AND also tried "exporting" it outside the loop (Thinking it could be nice to address it in the loop's condition).
Guess it's easy but being a newbie, I had to call for some help :)
Attaching the relevant code (simplified).
Many thanks !
const fetch = require("node-fetch");
const url = "https://www.apitest12345.com/API/";
const headers = {
"LOGIN": "abcd",
"PASSWD": "12345"
}
const data = '<xml></xml>'
// Attempt counter for the loop below.
let i = 0;
// NOTE(review): fetch() is not awaited here, so each iteration only starts a
// request and moves on; the `.then` callbacks run after the loop has finished,
// which is why nothing inside them can stop the loop.
do { // the loop
fetch(url, { method: 'POST', headers: headers, body: data})
.then((res) => {
return res.text()
})
.then((text) => {
console.log(text);
// NOTE(review): `> 0` does not match a '555' found at index 0.
if(text.indexOf('555') > 0) { // if the response includes '555' it means SUCCESS, and we can stop the loop
~STOP!~ //help me stop the loop :)
}
});
i += 1;
} while (i < 20);
Use simple for loop with async await.
const fetch = require("node-fetch");
const url = "https://www.apitest12345.com/API/";
const headers = {
"LOGIN": "abcd",
"PASSWD": "12345"
}
const data = '<xml></xml>'
/**
 * Sends the POST request up to `maxAttempts` times, strictly one after the
 * other, and stops as soon as a response body contains the marker '555'.
 *
 * Wrapped in an async function because `await` is not allowed at the top
 * level of a CommonJS script (this snippet uses `require`).
 *
 * @param {number} [maxAttempts=20] - Upper bound on the number of requests.
 * @returns {Promise<boolean>} true if the marker was seen, false otherwise.
 */
async function sendUntilSuccess(maxAttempts = 20) {
  for (let i = 0; i < maxAttempts; i++) {
    const res = await fetch(url, { method: 'POST', headers: headers, body: data });
    // res.text() returns a Promise; it must be awaited before it can be searched.
    const text = await res.text();
    if (text.includes('555')) {
      return true;
    }
  }
  return false;
}

sendUntilSuccess().catch((err) => console.error(err));
I am trying to upload an image that I get from my webcam to the Microsoft Azure Face API. I get the image from canvas.toDataURL('image/png'), which returns a data URI. I change the Content-Type to application/octet-stream, and when I attach the data URI to the POST request, I get a Bad Request (400) "Invalid Face Image". If I change the attached data to a Blob, I stop receiving errors; however, I only get back an empty array instead of a JSON object. I would really appreciate any help pointing me in the right direction.
Thanks!
Oh you're in such luck, i've just (successfully!) attempted this 2 days ago.
Sending base64-encoded JPEGs to Face API is seriously inefficient, The ratio of encoded output bytes to input bytes is 4:3 (33% overhead). Just send a byte array, it works, the docs mention it briefly.
And try to read as JPEG not PNG, that's just wasting bandwidth for webcam footage.
...
var dataUri = canvas.toDataURL('image/' + format);
var data = dataUri.split(',')[1];
var mimeType = dataUri.split(';')[0].slice(5)
var bytes = window.atob(data);
var buf = new ArrayBuffer(bytes.length);
var byteArr = new Uint8Array(buf);
for (var i = 0; i < bytes.length; i++) {
byteArr[i] = bytes.charCodeAt(i);
}
return byteArr;
Now use byteArr as your payload (data:) in $.ajax() for jQuery or iDontUnderStandHowWeGotHereAsAPeople() in any other hipster JS framework people use these days.
The reverse-hipster way of doing it is:
// Upload the raw bytes with a plain XMLHttpRequest.
var payload = byteArr;
var xhr = new XMLHttpRequest();
// open() has to come first: headers can only be set on an opened request.
xhr.open('POST', 'https://SERVICE_URL');
// Declare the body as raw binary data.
xhr.setRequestHeader('Content-Type', 'application/octet-stream');
xhr.send(payload);
To extend Dalvor's answer: this is the AJAX call that works for me:
// Turn the data URI into a Blob, then POST the raw bytes to the Face API
// detect endpoint and show the outcome in #results.
fetch(data)
  .then(res => res.blob())
  .then(blobData => {
    $.post({
      url: "https://westus.api.cognitive.microsoft.com/face/v1.0/detect",
      contentType: "application/octet-stream",
      headers: {
        'Ocp-Apim-Subscription-Key': '<YOUR-KEY-HERE>'
      },
      // Keep jQuery from trying to serialize the Blob into a query string.
      processData: false,
      data: blobData
    })
      .done(function(data) {
        $("#results").text(JSON.stringify(data));
      })
      .fail(function(err) {
        $("#results").text(JSON.stringify(err));
      });
  })
  // Covers failures while reading the data URI, before the POST is made.
  .catch(function(err) {
    $("#results").text(String(err));
  });
Full demo code here: https://jsfiddle.net/miparnisari/b1zzpvye/
For saving someone's 6 hours, I appended my right code.
I hope this code helps you.
Tools
React
Typescript
React-webcam
Mac OS
Axios
Code
index.tsx
Constants and ref
/**
* Constants
*/
const videoConstraints = {
width: 1280,
height: 720,
facingMode: 'user',
};
/**
* Refs
*/
const webcamRef = React.useRef<Webcam>(null);
Call back function
// Takes a screenshot from the webcam and sends it to the Face API as a Blob.
const capture = React.useCallback(() => {
  // No screenshot is available when the ref is not attached yet or when
  // getScreenshot() returns null; in that case there is nothing to upload.
  const base64Str = webcamRef.current?.getScreenshot();
  if (!base64Str) {
    return;
  }
  // A data URI has the form "data:image/jpeg;base64,<payload>"; keep the payload only.
  const [, payload] = base64Str.split(',');
  if (!payload) {
    return;
  }
  callCognitiveApi(b64toBlob(payload));
}, [webcamRef]);
In render
<Webcam audio={false} ref={webcamRef} screenshotFormat="image/jpeg" videoConstraints={videoConstraints} />
<button onClick={capture}>Capture photo</button>
base64toBlob
Thanks to creating-a-blob-from-a-base64-string-in-javascript
/**
 * Decodes a base64 string into a Blob.
 *
 * The decoded characters are converted to bytes `sliceSize` characters at a
 * time, and the resulting chunks are combined into one Blob.
 *
 * @param b64DataStr - Base64 payload (without any "data:...;base64," prefix).
 * @param contentType - MIME type assigned to the Blob.
 * @param sliceSize - Number of characters converted per chunk.
 * @returns Blob holding the decoded bytes.
 */
export const b64toBlob = (b64DataStr: string, contentType = '', sliceSize = 512) => {
  const decoded = atob(b64DataStr);
  const chunks = [];
  let start = 0;
  while (start < decoded.length) {
    const piece = decoded.slice(start, start + sliceSize);
    // Each character of the decoded string stands for exactly one byte.
    chunks.push(Uint8Array.from(piece, (ch) => ch.charCodeAt(0)));
    start += sliceSize;
  }
  return new Blob(chunks, { type: contentType });
};
callCognitiveApi
import axios from 'axios';
const subscriptionKey: string = 'This_is_your_subscription_key';
const url: string = 'https://this-is-your-site.cognitiveservices.azure.com/face/v1.0/detect';
/**
 * POSTs binary image data to the Face API detect endpoint and logs the
 * response, or the error if the request fails.
 *
 * @param data - Request body, sent as application/octet-stream.
 * @returns Promise that settles after the outcome has been logged, so callers
 *   can wait for the request to finish. Failures are logged, not rethrown.
 */
export const callCognitiveApi = (data: any) => {
  const config = {
    headers: { 'content-type': 'application/octet-stream', 'Ocp-Apim-Subscription-Key': subscriptionKey },
  };
  return axios
    .post(url, data, config)
    .then((res) => {
      console.log(res);
    })
    .catch((error) => {
      console.error(error);
    });
};
Result
So I got the answer finally by sending the image as a blob object. You first grab the image from canvas with:
let data = canvas.toDataURL('image/jpeg');
Afterwards, you can reformat it to a blob data object by running:
// fetch() accepts a data URI, so this turns the canvas output into a Blob.
fetch(data)
  .then(res => res.blob())
  .then(blobData => {
    // attach blobData as the data for the post request
  });
You will also need to switch the Content-Type of the post request to "application/octet-stream"
I am reading JSON file using fs.readFileSync and for each document obtained, I am making a rest API call using client.post. Once I get response, I want to place the received content into another JSON file which is a replica of input JSON except additional element which is the data received from client.post call. However probably because of async nature of client.post, I am unable to add element to output JSON. I am new to NodeJS. Can you please help me where I am missing. Below is code and data
data:
[
{
"ticker": "CLYD"
},
{
"ticker": "EGH"
}
]
Code:
var fs = require('fs');
var Client = require('node-rest-client').Client;
var data = fs.readFileSync(__dirname + "/data/stocks.json", "utf8");
processData(data);
/**
 * Parses the stock list, looks up each ticker, and writes the enriched list to
 * data/response.json next to this script.
 *
 * Every lookup is awaited before the next one starts, so the file is written
 * only after all results are in. This works whether getValuesForTicker returns
 * a plain value or a Promise.
 *
 * @param {string} data - JSON text of an array of objects with a `ticker` string.
 * @returns {Promise<void>} Settles once the output file has been written.
 */
async function processData (data) {
  const stocks = JSON.parse(data);
  for (const stock of stocks) {
    stock.stockInformation = await getValuesForTicker(stock.ticker.trim());
  }
  const jsonOutput = JSON.stringify(stocks, null, '\t');
  fs.writeFileSync(__dirname + "/data/response.json", jsonOutput);
}
/**
 * Posts the ticker to the pricing service with node-rest-client
 * (https://www.npmjs.com/package/node-rest-client).
 *
 * @param {string} ticker - Ticker symbol placed in the request body.
 * @returns {*} The `price` assigned by the response callback, if that callback
 *   has already run by the time `post` returns; otherwise undefined.
 */
function getValuesForTicker (ticker) {
  const restClient = new Client();
  // JSON request: the ticker as the body, JSON content-type and accept headers.
  const requestArgs = {
    data: { "ticker" : ticker },
    headers: { "Content-Type": "application/json", "Accept" : "application/json" }
  };
  let price;
  restClient.post("https://url.providing.response.as.json.content/", requestArgs, (body, response) => {
    // Copy the parsed body through JSON and keep its `price` field.
    price = JSON.parse(JSON.stringify(body)).price;
  });
  return price;
};
Since getValuesForTicker makes an async call to fetch data, it should invoke a callback once the data is received (or, better, return a promise) rather than return the result directly. Currently undefined is returned, because the function returns before the value is assigned.
/**
 * Posts the ticker to the pricing service with node-rest-client
 * (https://www.npmjs.com/package/node-rest-client) and delivers the price
 * through a Promise.
 *
 * @param {string} ticker - Ticker symbol placed in the request body.
 * @param {Function} [callback] - Kept in the signature for existing callers; not invoked.
 * @returns {Promise<*>} Resolves with the `price` field of the response body;
 *   rejects if the request emits an 'error' event or the body cannot be read.
 */
function getValuesForTicker (ticker, callback) {
  return new Promise(function (resolve, reject) {
    var client = new Client();
    // set content-type header and data as json in args parameter
    var args = {
      data: { "ticker" : ticker },
      headers: { "Content-Type": "application/json", "Accept" : "application/json" }
    };
    var request = client.post("https://url.providing.response.as.json.content/", args, function (data, response) {
      try {
        // parsed response body as js object
        resolve(data.price);
      } catch (err) {
        // e.g. an empty body: surface it as a rejection instead of an uncaught throw
        reject(err);
      }
    });
    // Request-level failures are reported through an 'error' event on the
    // object returned by post(); forward them so the Promise cannot hang.
    if (request && typeof request.on === 'function') {
      request.on('error', reject);
    }
  });
}
and to get the data once async call is done you will need to call then function as below:
// Grab the entry now: the promise resolves later, and by then the loop counter
// `j` may already point past the end of `obj`.
const entry = obj[j];
getValuesForTicker(entry.ticker.trim())
  .then(function (val) {
    entry.stockInformation = val;
  })
  .catch(function (err) {
    // Report a failed lookup instead of leaving the rejection unhandled.
    console.error(err);
  });
Considering you are new to Node.js, this may be hard to grasp at first. Take some time to understand callbacks and promises first.