Selector with Cheerio fails to retrieve children - node.js

I believe this one is a bug.
I am trying to write a simple web scraper with request and cheerio.
How I tried to solve it:
Yes, I played with other ways to define a selector.
Yes, I have investigated other stackoverflow questions.
Yes, I have created an issue on cheerio github, here is the link: https://github.com/cheeriojs/cheerio/issues/1252
Yes, I am a professional web developer and this is not the first time I do node.js
Update:
After some people pointed out, the issue was that needed dom nodes were created after my page was parsed and traversed by cheerio.
So the part of the page I requested simply was not there.
Any Ideas how to bypass that?
I use versions:
{
"name": "discont",
"version": "1.0.0",
"description": "Find when the item is on sale",
"main": "index.js",
"license": "MIT",
"devDependencies": {
"express": "^4.16.4"
},
"dependencies": {
"cheerio": "^1.0.0-rc.2",
"ejs": "^2.6.1",
"request": "^2.88.0"
}
}
This is the HTML I am trying to scrape:
The link is here:
https://www.asos.com/new-look-wide-fit/new-look-wide-fit-court-shoe/prd/10675413?clr=oatmeal&SearchQuery=&cid=6461&gridcolumn=1&gridrow=9&gridsize=4&pge=1&pgesize=72&totalstyles=826
This is my code:
request(url, options, function(error, response, html) {
if (!error) {
var $ = cheerio.load(html, { withDomLvl1: false });
// console.log("product-price", $("div.product-price")[0].attribs);
console.log("product-price", $("div#product-price > div"));
}
});
The console.log returns an empty array(unable to find nested div).
This is what I get in return:
initialize {
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
_root:
initialize {
'0':
{ type: 'root',
name: 'root',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Array],
parent: null,
prev: null,
next: null },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1,
_root: [Circular] },
length: 0,
prevObject:
initialize {
'0':
{ type: 'root',
name: 'root',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Array],
parent: null,
prev: null,
next: null },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1,
_root: [Circular] } }
but if I change my code to
request(url, options, function(error, response, html) {
if (!error) {
var $ = cheerio.load(html, { withDomLvl1: false });
// console.log("product-price", $("div.product-price")[0].attribs);
console.log("product-price", $("div#product-price"));
}
});
I get an array with a single element:
initialize {
'0':
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs:
{ class: 'product-price',
id: 'product-price',
'data-bind': 'component: { name: "product-price", params: {state: state, showGermanVatMessage: false }}' },
'x-attribsNamespace': { class: undefined, id: undefined, 'data-bind': undefined },
'x-attribsPrefix': { class: undefined, id: undefined, 'data-bind': undefined },
children: [],
parent:
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: [Object],
'x-attribsNamespace': [Object],
'x-attribsPrefix': [Object],
children: [Array],
parent: [Object],
prev: [Object],
next: [Object] },
prev:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Object],
next: [Circular] },
next:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Circular],
next: [Object] } },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
_root:
initialize {
'0':
{ type: 'root',
name: 'root',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Array],
parent: null,
prev: null,
next: null },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1,
_root: [Circular] },
length: 1,
prevObject:
initialize {
'0':
{ type: 'root',
name: 'root',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Array],
parent: null,
prev: null,
next: null },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1,
_root: [Circular] } }
yet, I am not able to see children of the element (the children array is empty), and I am not able to perform any methods on the object such as find() or text()
Any help is welcome!

Cheerio only has access to the DOM before any special things like XHRs have happened. You would need puppeteer or nightmarejs for the post-js-rendered DOM

Related

Retrieving an autocomplete value with cheerio

there's this website that conjugates korean verbs https://www.verbix.com/languages/korean. What i wanna do is type something in the input and then get the text of the suggestion that appears.
That blue part is the 'a' element's text
This is my code:
app.get("/conjugation", (req, res) => {
axios("https://www.verbix.com/languages/korean")
.then((response) => {
const htmlData = response.data;
const $ = cheerio.load(htmlData);
//Input Element
const input = $("#auto-complete-edit");
//Pass the value to it
input.attr("value", "먹어요");
//Get the selected element
const select = $(".selected");
select
.map((i, child) => {
//Get the 'a' element inner text
return $(child).find("a").text();
})
.get();
console.log(select);
})
.catch((err) => console.log(err));
});
And the console shows this
LoadedCheerio {
length: 0,
options: { xml: false, decodeEntities: true },
_root: <ref *1> LoadedCheerio {
'0': Document {
parent: null,
prev: null,
next: null,
startIndex: null,
endIndex: null,
children: [Array],
type: 'root',
'x-mode': 'no-quirks'
},
length: 1,
options: { xml: false, decodeEntities: true },
_root: [Circular *1]
},
prevObject: <ref *2> LoadedCheerio {
'0': Document {
parent: null,
prev: null,
next: null,
startIndex: null,
endIndex: null,
children: [Array],
type: 'root',
'x-mode': 'no-quirks'
},
length: 1,
options: { xml: false, decodeEntities: true },
_root: [Circular *2]
}
}

Cheerio just shows the first item instead of the list of children in div

I'm trying to get all items in a specific div that is already addressed.
But after using children() it only shows the first item instead of a list of whole children.
Here is the source:
And here is my code:
> const html = res.data;
> const $ = cheerio.load(html);
> const data = $('div[id="thepics"]');
> console.log(data);
And its result:
LoadedCheerio {
'0': <ref *1> Node {
type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: [Object: null prototype] {
style: 'margin:-5px 0px 10px 0px',
id: 'thepics'
},
'x-attribsNamespace': [Object: null prototype] { style: undefined, id: undefined },
'x-attribsPrefix': [Object: null prototype] { style: undefined, id: undefined },
children: [ [Node] ],
parent: Node {
type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: [Object: null prototype],
'x-attribsNamespace': [Object: null prototype],
'x-attribsPrefix': [Object: null prototype],
children: [Array],
parent: [Node],
prev: [Node],
next: [Node]
},
prev: Node {
type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: [Object: null prototype],
'x-attribsNamespace': [Object: null prototype],
'x-attribsPrefix': [Object: null prototype],
children: [],
parent: [Node],
prev: [Node],
next: [Circular *1]
},
next: Node {
type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: [Object: null prototype],
'x-attribsNamespace': [Object: null prototype],
'x-attribsPrefix': [Object: null prototype],
children: [Array],
parent: [Node],
prev: [Circular *1],
next: [Node]
}
},
length: 1,
options: { xml: false, decodeEntities: true },
_root: <ref *2> LoadedCheerio {
'0': Node {
type: 'root',
name: 'root',
parent: null,
prev: null,
next: null,
children: [Array],
'x-mode': 'no-quirks'
},
length: 1,
options: { xml: false, decodeEntities: true },
_root: [Circular *2]
},
prevObject: <ref *2> LoadedCheerio {
'0': Node {
type: 'root',
name: 'root',
parent: null,
prev: null,
next: null,
children: [Array],
'x-mode': 'no-quirks'
},
length: 1,
options: { xml: false, decodeEntities: true },
_root: [Circular *2]
}
}
And after try to log data.children() the result is:
LoadedCheerio {
'0': Node {
type: 'script',
name: 'script',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: [Object: null prototype] {},
'x-attribsNamespace': [Object: null prototype] {},
'x-attribsPrefix': [Object: null prototype] {},
children: [ [Node] ],
parent: Node {
type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: [Object: null prototype],
'x-attribsNamespace': [Object: null prototype],
'x-attribsPrefix': [Object: null prototype],
children: [Array],
parent: [Node],
prev: [Node],
next: [Node]
},
prev: null,
next: null
},
length: 1,
options: { xml: false, decodeEntities: true },
_root: <ref *1> LoadedCheerio {
'0': Node {
type: 'root',
name: 'root',
parent: null,
prev: null,
next: null,
children: [Array],
'x-mode': 'no-quirks'
},
length: 1,
options: { xml: false, decodeEntities: true },
_root: [Circular *1]
},
prevObject: LoadedCheerio {
'0': Node {
type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: [Object: null prototype],
'x-attribsNamespace': [Object: null prototype],
'x-attribsPrefix': [Object: null prototype],
children: [Array],
parent: [Node],
prev: [Node],
next: [Node]
},
length: 1,
options: { xml: false, decodeEntities: true },
_root: <ref *1> LoadedCheerio {
'0': [Node],
length: 1,
options: [Object],
_root: [Circular *1]
},
prevObject: <ref *1> LoadedCheerio {
'0': [Node],
length: 1,
options: [Object],
_root: [Circular *1]
}
}
}

How to read password protected PDF file in Nodejs and get it in buffer?

I tried using pdfjs-dist.
getting large json response.
var PDFJS=require('pdfjs-dist');
PDFJS.getDocument({ url: 'p1.pdf', password: '' }).then(function(pdf_doc)
{
console.log(pdf_doc);
}).catch(function(error) {
// incorrect password
// error is an object having 3 properties : name, message & code
});
Response
This is the whole response I am getting.
but I need response in buffer.
Can it be converted to buffer.
PDFDocumentProxy {
loadingTask:
{ _capability:
{ resolve: [Function], reject: [Function], promise: [Promise] },
_transport:
WorkerTransport {
messageHandler: [Object],
loadingTask: [Circular],
commonObjs: [Object],
fontLoader: [GenericFontLoader],
_params: [Object],
CMapReaderFactory: [DOMCMapReaderFactory],
destroyed: false,
destroyCapability: null,
_passwordCapability: null,
_networkStream: [PDFNodeStream],
_fullReader: [PDFNodeStreamFsFullReader],
_lastProgress: [Object],
pageCache: [],
pagePromises: [],
downloadInfoCapability: [Object],
numPages: 4,
pdfDocument: [Circular] },
_worker:
{ name: null,
destroyed: false,
postMessageTransfers: true,
verbosity: 1,
_readyCapability: [Object],
_port: [LoopbackPort],
_webWorker: null,
_messageHandler: [Object] },
docId: 'd0',
destroyed: false,
onPassword: null,
onProgress: null,
onUnsupportedFeature: null },
_pdfInfo:
{ numPages: 4,
fingerprint: '3432353738363537336c6e665361446f6f744f4a70' },
_transport:
WorkerTransport {
messageHandler:
{ sourceName: 'd0',
targetName: 'd0_worker',
comObj: [LoopbackPort],
callbackId: 1,
streamId: 1,
postMessageTransfers: true,
streamSinks: [Object],
streamControllers: [Object: null prototype] {},
callbacksCapabilities: [Object: null prototype] {},
actionHandler: [Object],
_onComObjOnMessage: [Function] },
loadingTask:
{ _capability: [Object],
_transport: [Circular],
_worker: [Object],
docId: 'd0',
destroyed: false,
onPassword: null,
onProgress: null,
onUnsupportedFeature: null },
commonObjs: { objs: [Object: null prototype] {} },
fontLoader:
GenericFontLoader {
docId: 'd0',
nativeFontFaces: [],
styleElement: null,
loadingContext: [Object],
loadTestFontId: 0 },
_params:
[Object: null prototype] {
url: 'p1.pdf',
password: '',
rangeChunkSize: 65536,
CMapReaderFactory: [Function: DOMCMapReaderFactory],
ignoreErrors: true,
pdfBug: false,
nativeImageDecoderSupport: 'none',
maxImageSize: -1,
isEvalSupported: true,
disableFontFace: true,
disableRange: false,
disableStream: false,
disableAutoFetch: false,
disableCreateObjectURL: false },
CMapReaderFactory: DOMCMapReaderFactory { baseUrl: null, isCompressed: false },
destroyed: false,
destroyCapability: null,
_passwordCapability: null,
_networkStream:
PDFNodeStream {
source: [Object],
url: [Url],
isHttp: false,
isFsUrl: true,
httpHeaders: {},
_fullRequest: [PDFNodeStreamFsFullReader],
_rangeRequestReaders: [Array] },
_fullReader:
PDFNodeStreamFsFullReader {
_url: [Url],
_done: false,
_storedError: null,
onProgress: [Function],
_contentLength: 112979,
_loaded: 112979,
_filename: null,
_disableRange: false,
_rangeChunkSize: 65536,
_isStreamingSupported: true,
_isRangeSupported: true,
_readableStream: [ReadStream],
_readCapability: [Object],
_headersCapability: [Object] },
_lastProgress: { loaded: 112979, total: 112979 },
pageCache: [],
pagePromises: [],
downloadInfoCapability:
{ resolve: [Function], reject: [Function], promise: [Promise] },
numPages: 4,
pdfDocument: [Circular] } }
*ignore below text*
efwrg rgsretg resgerstgh;ergh ;resjgysregh regjes powrjgu oiuueryoeq uieqroeqreqrilih ehr oiyeroeq ioiyeqroeq oieyqrioeq oieqyr oiyeqr oiyeqrp ioqyet oiehr oiyerh oieyreq oiyheqri iohereqk ioheqr qerioyqereq ioehqriheq rioqehriqeb ioeqrhpeq ioeqrhiqe ioqehriq ioqerhioq oirhqeipor oiqehrieq ioehqrq ioeqhrieq iohqerpq ieqhrpeq ioeqhrpeq iheqrpqe oiehrpqe ieqhrqierh ioeqhr ieqhr ioeqrh piqerh ieqhr iheqr piheqr ioheqr iheqr ioeqhrp ioqhre oieqhr oeqiyr qoeiryf pouqer poqure pouqr pouqre[q poquerq poqeur[q poqeur poqwuer poquer[ poqwur[wq poqr[ poqwhr powrq pow
You may open and read a password protected PDF like below. Working with your existing code:
var PDFJS = require('pdfjs-dist');
PDFJS.getDocument({ url: 'p1.pdf', password: '' }).then(function(pdf)
{
let text = [];
for(let i = 1; i <= pdf.numPages; i++) {
pdf.getPage(i).then(function(page) {
page.getTextContent().then(function(data) {
for(let j = 0; j < data.items.length; j++) {
text.push(data.items[j].str);
}
});
});
}
}).catch(function(error) {
// incorrect password
// error is an object having 3 properties : name, message & code
});

Aggregate function returns null GraphQL

I am testing a basic aggregation function using counts from Sequelize and here's my type Counts:
type Creserve {
id: ID!
rDateStart: Date!
rDateEnd: Date!
grade: Int!
section: String!
currentStatus: String!
user: User!
cartlab: Cartlab!
}
type Counts {
section: String!
count: Int
}
type Query {
getBooking(id: ID!): Creserve!
allBookings: [Creserve]
getBookingByUser(userId: ID): Creserve
upcomingBookings: [Creserve]
countBookings: [Counts]
}
I am using countBookings as my query for aggregate functions and here's my resolver for the query:
countBookings: async (parent, args, {models}) =>
{
const res = await models.Creserve.findAndCountAll({
group: 'section',
attributes: ['section', [Sequelize.fn('COUNT', 'section'), 'count']]
});
return res.rows;
},
The query that it outputs is this:
Executing (default): SELECT "section", COUNT('section') AS "count" FROM "Creserve" AS "Creserve" GROUP BY "section";
And tried this query in my psql shell and it's working fine:
section | count
---------+-------
A | 2
R | 2
However, when I tried querying countBookings in my GraphQL Playground, section is returned but not the count:
{
"data": {
"countBookings": [
{
"section": "A",
"count": null
},
{
"section": "R",
"count": null
}
]
}
}
Is there something I missed out? Or is this a bug? This is the answer I tried following to with this example: https://stackoverflow.com/a/45586121/9760036
Thank you very much!
edit: returning a console.log(res.rows) outputs something like this:
[ Creserve {
dataValues: { section: 'A', count: '2' },
_previousDataValues: { section: 'A', count: '2' },
_changed: {},
_modelOptions:
{ timestamps: true,
validate: {},
freezeTableName: true,
underscored: false,
underscoredAll: false,
paranoid: false,
rejectOnEmpty: false,
whereCollection: null,
schema: null,
schemaDelimiter: '',
defaultScope: {},
scopes: [],
indexes: [],
name: [Object],
omitNull: false,
hooks: [Object],
sequelize: [Sequelize],
uniqueKeys: {} },
_options:
{ isNewRecord: false,
_schema: null,
_schemaDelimiter: '',
raw: true,
attributes: [Array] },
__eagerlyLoadedAssociations: [],
isNewRecord: false },
Creserve {
dataValues: { section: 'R', count: '2' },
_previousDataValues: { section: 'R', count: '2' },
_changed: {},
_modelOptions:
{ timestamps: true,
validate: {},
freezeTableName: true,
underscored: false,
underscoredAll: false,
paranoid: false,
rejectOnEmpty: false,
whereCollection: null,
schema: null,
schemaDelimiter: '',
defaultScope: {},
scopes: [],
indexes: [],
name: [Object],
omitNull: false,
hooks: [Object],
sequelize: [Sequelize],
uniqueKeys: {} },
_options:
{ isNewRecord: false,
_schema: null,
_schemaDelimiter: '',
raw: true,
attributes: [Array] },
__eagerlyLoadedAssociations: [],
isNewRecord: false } ]
Here's for res.count:
Executing (default): SELECT "section", COUNT('section') AS "count" FROM "Creserve" AS "Creserve" GROUP BY "section";
[ { count: '2' }, { count: '2' } ]
Problem
Actually you are doing everything right here... but what is happening here is the sequlize doesn't return plain object... It always returns the data in form of instance like that
[ Creserve {
dataValues: { section: 'A', count: '2' },
_previousDataValues: { section: 'A', count: '2' },
_changed: {},
_modelOptions:
{ timestamps: true,
Solution
I am not sure but there is no other way instead of looping and makes
response to json object...
const array = []
res.rows.map((data) => {
array.push(data.toJSON())
})
return array

Writing A Mongoose Plug In- plugin() a method?

I am interested in writing a mongoose plug in to make all fields required. I know there are other ways to do this, but I like the idea of writing my own plug in.
From docs http://mongoosejs.com/docs/plugins:
// game-schema.js
var lastMod = require('./lastMod');
var Game = new Schema({ ... });
Game.plugin(lastMod, { index: true });
but when I create a model from my schema and look at the properties, I don't see a plugin() method:
var mongoose = require('mongoose');
var CpuSchema = require("../schemas/cpu");
var Cpu = mongoose.model('Cpu', CpuSchema);
console.log(Cpu);
module.exports = Cpu;
one#demo ~/cloudimageshare-monitoring/project $ node /home/one/cloudimageshare-monitoring/project/app/data/models/cpu.js
{ [Function: model]
base:
{ connections: [ [Object] ],
plugins: [],
models: { Cpu: [Circular] },
modelSchemas: { Cpu: [Object] },
options: { pluralization: true } },
modelName: 'Cpu',
model: [Function: model],
db:
{ base:
{ connections: [Object],
plugins: [],
models: [Object],
modelSchemas: [Object],
options: [Object] },
collections: { cpus: [Object] },
models: {},
replica: false,
hosts: null,
host: null,
port: null,
user: null,
pass: null,
name: null,
options: null,
otherDbs: [],
_readyState: 0,
_closeCalled: false,
_hasOpened: false,
_listening: false },
discriminators: undefined,
schema:
{ paths:
{ timeStamp: [Object],
avaiable: [Object],
status: [Object],
metrics: [Object],
_id: [Object],
__v: [Object] },
subpaths: {},
virtuals: { id: [Object] },
nested: {},
inherits: {},
callQueue: [],
_indexes: [],
methods: {},
statics: {},
tree:
{ timeStamp: [Object],
avaiable: [Function: Boolean],
status: [Function: String],
metrics: [Object],
_id: [Object],
id: [Object],
__v: [Function: Number] },
_requiredpaths: undefined,
discriminatorMapping: undefined,
_indexedpaths: undefined,
options:
{ id: true,
noVirtualId: false,
_id: true,
noId: false,
read: null,
shardKey: null,
autoIndex: true,
minimize: true,
discriminatorKey: '__t',
versionKey: '__v',
capped: false,
bufferCommands: true,
strict: true,
pluralization: true },
_events: {} },
options: undefined,
collection:
{ collection: null,
opts: { bufferCommands: true, capped: false },
name: 'cpus',
conn:
{ base: [Object],
collections: [Object],
models: {},
replica: false,
hosts: null,
host: null,
port: null,
user: null,
pass: null,
name: null,
options: null,
otherDbs: [],
_readyState: 0,
_closeCalled: false,
_hasOpened: false,
_listening: false },
queue: [ [Object] ],
buffer: true } }
Here on the model, I don't see a plugin() method.
The plugin method is defined on the Schema class and you can see it on your CpuSchema object.
On your Cpu model you can get it by calling
console.log(Cpu.schema.plugin)
From the mongoose source code on GitHub:
/**
* Registers a plugin for this schema.
*
* #param {Function} plugin callback
* #param {Object} opts
* #see plugins
* #api public
*/
Schema.prototype.plugin = function (fn, opts) {
fn(this, opts);
return this;
};
When you pass your plugin function to it it simply executes the function and passes the schema reference into it.

Resources