VBA Scraping across multiple webpages - excel

So, I have the following code to scrap the data from a website and it's working without any problem.
My "issue" now it's that I need to run the code trought multiple webpages because the website I'm scraping has a pagination script.
Eg: One single page has 48 records, but in most of the cases the page has 200+ records but they are sub-divided on 3/4 pages.
My code:
Public Sub Roupa()
Dim data As Object, i As Long, html As HTMLDocument, r As Long, c As Long, item As Object, div As Object
Set html = New HTMLDocument '<== VBE > Tools > References > Microsoft HTML Object Library
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=100", False
.send
html.body.innerHTML = .responseText
End With
Set data = html.getElementsByClassName("w-product__content")
For Each item In data
r = r + 1: c = 1
For Each div In item.getElementsByTagName("div")
With ThisWorkbook.Worksheets("Roupa")
.Cells(r, c) = div.innerText
End With
c = c + 1
Next
Next
Sheets("Roupa").Range("A:A,C:C,F:F,G:G,H:H,I:I").EntireColumn.Delete
End Sub
UPDATE
I've tried adding this For n = 1 To 2 before the With, it works but I need to know the exact number of pages so that's not so helpful..

Work out how many pages there are by dividing the result count by the results per page. Then do a loop concatenating the appropriate page number onto the url
Option Explicit
Public Sub Roupa()
Dim data As Object, i As Long, html As HTMLDocument, r As Long, c As Long, item As Object, div As Object
Set html = New HTMLDocument '<== VBE > Tools > References > Microsoft HTML Object Library
Const RESULTS_PER_PAGE As Long = 48
Const START_URL As String = "https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=" & RESULTS_PER_PAGE & "&page=1"
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", START_URL, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
html.body.innerHTML = .responseText
Dim numPages As Long, numResults As Long, arr() As String
arr = Split(html.querySelector(".w-filters__element").innerText, Chr$(32))
numResults = arr(UBound(arr))
numPages = 1
If numResults > RESULTS_PER_PAGE Then
numPages = Application.RoundUp(numResults / RESULTS_PER_PAGE, 0)
End If
For i = 1 To numPages
If i > 1 Then
.Open "GET", Replace$("https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=" & RESULTS_PER_PAGE & "&page=1", "page=1", "page=" & i), False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
html.body.innerHTML = .responseText
End If
Set data = html.getElementsByClassName("w-product__content")
For Each item In data
r = r + 1: c = 1
For Each div In item.getElementsByTagName("div")
With ThisWorkbook.Worksheets("Roupa")
.Cells(r, c) = div.innerText
End With
c = c + 1
Next
Next
Next
End With
Sheets("Roupa").Range("A:A,C:C,F:F,G:G,H:H,I:I").EntireColumn.Delete
End Sub
Thinking about about what #AhmedAu said, if page has loaded properly, looks like a good way to also get page count is to simply use:
numPages = html.querySelectorAll("[data-page]").Length

Related

How to skip a row in Excel with missing html tag using VBA

There are 15 objects listed on this website, each has a link under the photo. The 6th object has none. When extracting and transferring the content with my code the missing html-href is not skipped and in Excel, 14 hrefs are listed below each other (the 6th cell should remain empty or "no ducument"), but the last cell does (& error because 14<=>15). Unfortunately I have to keep my code structure and just need a loop or condition to complete it. Does anyone have any ideas? Thanks.
My incomplete code:
Public Sub GetData()
Dim html As New HTMLDocument
Dim elmt01 As Object, elmt02 As Object
Dim y As Long
Dim xURL As String
Set html = New MSHTML.HTMLDocument
xURL = "https://immobilienpool.de/suche/immobilien?page=1"
With CreateObject("MSXML2.XMLHTTP.6.0")
.Open "GET", xURL, False
.send
html.body.innerHTML = .responseText
End With
Set elmt01 = html.querySelectorAll("li[class*='contentBox']") '15 items
Set elmt02 = html.querySelectorAll("li a[title*='zusätzliche']") '14 hrefs
For y = 0 To elmt01.Length - 1
If InStr(elmt02, "pdf") Then 'better: If elmt02 exists in elmt01 then...
ActiveSheet.Cells(y + 1, 2) = elmt02.Item(y).href
Else
ActiveSheet.Cells(y + 1, 2) = "No document"
End If
Next
End Sub
The following script should solve the issue you are having. I had to modify your code to skip the blank row. I hope you will be able to comply with the current version:
Public Sub GetData()
Dim Html As HTMLDocument, HTMLDoc As HTMLDocument
Dim oPdfLink As Object, xURL As String, I As Long
Set Html = New MSHTML.HTMLDocument
Set HTMLDoc = New MSHTML.HTMLDocument
xURL = "https://immobilienpool.de/suche/immobilien?page=1"
With CreateObject("MSXML2.XMLHTTP.6.0")
.Open "GET", xURL, False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll("li[class*='contentBox']")
For I = 0 To .Length - 1
HTMLDoc.body.innerHTML = .item(I).outerHTML
Set oPdfLink = HTMLDoc.querySelector("a[title*='zusätzliche']")
If Not oPdfLink Is Nothing Then
ActiveSheet.Cells(I + 1, 2) = oPdfLink.href
Else:
ActiveSheet.Cells(I + 1, 2) = "No document"
End If
Next I
End With
End Sub

Getting elements using multiple tags

I wish to pull data into excel from the following link: https://echa.europa.eu/registration-dossier/-/registered-dossier/13817/7/1 for Tox summaries for inhalation routes, dermal, eyes etc
The code below partly achieves this
Public Sub GetContents3()
Dim XMLReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
XMLReq.Open "Get", "https://echa.europa.eu/registration-dossier/-/registered-dossier/13817/7/1", False
XMLReq.send
HTMLDoc.body.innerHTML = XMLReq.responseText
Set SubSectList = HTMLDoc.getElementById("SectionContent")
Debug.Print SubSectList.innerText
End Sub
Although the code above works okay there are a few issues Id like to address.
For some reason the first Workers - Hazard via inhalation route is not being pulled.
I wish to use the tags dt and dd to refine how the information should be pulled. As an example of how I would like the data to be printed:
Workers - Hazard via inhalation route
Systemic effects
Long term exposure
Hazard assessment conclusion: DNEL (Derived No Effect Level)
Value: 1.41 mg/m³
... Etc (to include all data)
Workers - Hazard via dermal route
Systemic effects
Long term exposure
Hazard assessment conclusion: DNEL (Derived No Effect Level)
Value: 2.06 mg/kg bw/day
And So on for each Route.
To try and Achieve this I have the following code:
Public Sub GetContents3()
Dim XMLReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
XMLReq.Open "Get", "https://echa.europa.eu/registration-dossier/-/registered-dossier/13817/7/1", False
XMLReq.send
HTMLDoc.body.innerHTML = XMLReq.responseText
Set SubSectList = HTMLDoc.getElementById("SectionContent")
Set SubSects = SubSectList.getElementsByTagName("dt")
For Each SubSect In SubSects
Debug.Print SubSect.innerText & " " & SubSect.NextSibling.innerText
Next SubSect
End Sub
This is better but now it doesn't pull the subheadings.
I need someway to pull the information from multiple tags without overlapping data however I am unsure how to do this and so far attempts have just caused errors.
If anyone knows how I can modify the code to achieve the desired output above that would be great.
You can specify a css selector pattern to match the relevant tags, then during a loop over returned nodes, check the tagName, if DD or DT you need to combine into a single line for output:
Option Explicit
Public Sub GetContents()
Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
Set http = New MSXML2.XMLHTTP60: Set html = New MSHTML.HTMLDocument
With http
.Open "GET", "https://echa.europa.eu/registration-dossier/-/registered-dossier/13817/7/1", False
.send
html.body.innerHTML = .responseText
End With
Dim nodeList As MSHTML.IHTMLDOMChildrenCollection, i As Long, r As Long, concat As String
Set nodeList = html.querySelectorAll("#SectionContent h3,#SectionContent h4,#SectionContent h5,#SectionContent h6, #SectionContent dt, #SectionContent dd")
For i = 1 To nodeList.Length - 1
Select Case nodeList.Item(i).tagName
Case "DT"
concat = nodeList.Item(i).innerText
Case "DD"
concat = concat & Chr$(32) & nodeList.Item(i).innerText
r = r + 1
ActiveSheet.Cells(r, 1) = concat
Case Else
concat = vbNullString
r = r + 1
ActiveSheet.Cells(r, 1) = nodeList.Item(i).innerText
End Select
Next
End Sub
Now, I would like something tidier than the requested list, so how about the following tabulation?
Option Explicit
Public Sub GetContents()
Dim http As MSXML2.XMLHTTP60, html As MSHTML.HTMLDocument
Set http = New MSXML2.XMLHTTP60: Set html = New MSHTML.HTMLDocument
With http
.Open "GET", "https://echa.europa.eu/registration-dossier/-/registered-dossier/13817/7/1", False
.send
html.body.innerHTML = .responseText
End With
Dim nodeList As MSHTML.IHTMLDOMChildrenCollection, i As Long
Dim c As Long, r As Long, tag As String
Set nodeList = html.querySelectorAll("#SectionContent h3,#SectionContent h4,#SectionContent h5,#SectionContent h6, #SectionContent dt, #SectionContent dd")
r = 1
For i = 1 To nodeList.Length - 1
tag = nodeList.Item(i).tagName
Select Case tag
Case "DT"
c = 5
Case "DD"
c = 6
r = r + 1
Case Else
c = Right$(tag, 1) - 2
End Select
ActiveSheet.Cells(IIf(tag = "DD", r - 1, r), c) = nodeList.Item(i).innerText
Next
End Sub

Webscrape inside a for loop - Follow up

Following my previous question Webscrape VBA with condition, I started trying to automate the procedure for a list of url from this website here that I prepared in my excel document. When I tried for 20 and 30 url it worked perfectly, yet when I increased it, a "Script out of range error" occurred concerning the ReDim in the GetNodesTextAsArray, do you have any idea why ?
After some research I Tried to replace it by a for loop but it doesn't chagne anything.
Public Sub WindInfo()
'VBE> Tools > References:
'1. Microsoft, XML v6
'2. Microsoft HTML Object Library
'3. Microsoft Scripting Runtime
Dim xhr As MSXML2.XMLHTTP60: Set xhr = New MSXML2.XMLHTTP60
Dim html As MSHTML.HTMLDocument: Set html = New MSHTML.HTMLDocument
Dim ws As Worksheet: Set ws = ThisWorkbook.Worksheets("Sheet1")
Dim url As String
Dim j As Integer
Dim r As Long
r = 1
For j = 1 To 20
url = Worksheets("List").Cells(j, 1).Value
With xhr
.Open "GET", url, False
.send
html.body.innerHTML = .responseText
End With
Dim generalities As Object, arrGen(), partsList As Object
Set generalities = html.querySelectorAll("#bloc_texte table ~ table li")
arrGen = GetNodesTextAsArray(generalities)
Dim parts As Object, numberOfParts As Long
Set partsList = html.querySelectorAll("h1 ~ h3, ul ~ h3")
If partsList.Length > 0 Then
numberOfParts = html.querySelectorAll("h1 ~ h3, ul ~ h3").Length / 2
Set parts = html.querySelectorAll("h3 + ul")
Dim i As Long, liNodes As Object, arr()
Dim html2 As MSHTML.HTMLDocument: Set html2 = New MSHTML.HTMLDocument
For i = 0 To numberOfParts - 1
ws.Cells(r, 1).Resize(1, UBound(arrGen)) = arrGen
html2.body.innerHTML = parts.Item(i).outerHTML & parts.Item(i + numberOfParts).outerHTML
Set liNodes = html2.querySelectorAll("li")
arr = GetNodesTextAsArray(liNodes)
ws.Cells(r, 5).Resize(1, UBound(arr)) = arr
r = r + 1
Next
Else
arr = GetNodesTextAsArray(html.querySelectorAll("#bloc_texte h1 + ul").Item(1).getElementsByTagName("li"))
ws.Cells(r, 1).Resize(1, UBound(arrGen)) = arrGen
ws.Cells(r, 5).Resize(1, UBound(arr)) = arr
r = r + 1
End If
Application.Wait (Now + TimeValue("0:00:01"))
Next
End Sub
Public Function GetNodesTextAsArray(ByVal nodeList As Object) As Variant()
Dim i As Long, results()
ReDim results(1 To nodeList.Length)
For i = 0 To nodeList.Length - 1
results(i + 1) = nodeList.Item(i).innerText
Next i
GetNodesTextAsArray = results
End Function
9/10 the problem with VBA webscraping is page loading related.
So what you want to do is try an error handler that Loops while the page loads, and then tests to see if the element appears on the page.
I'll leave the code to you, but the timing here is essentially everything since even status code checking and waiting for the document to be loaded checking, is not a guarantee everything you need is loaded.
Even with great tools like selenium and puppeteer you still need to check / handle element loading and timing, and cant just assume, since document loaded, the element is there.
makes sense?

Print title importing from one location to another

I've created a vba script to parse the title of diffetent posts along with the editing status of those posts from a website. What I wish to do now is let my script parse the title from it's landing page but print the title at the same time when it will print the editing status. I do not wish to create two subs for this task. I do not even know if it is possible in vba. However, if anything unclear please check out the comment within my script.
Sub ImportTitleFromAnotherLocation()
Const LINK$ = "https://stackoverflow.com/questions/tagged/web-scraping"
Const prefix$ = "https://stackoverflow.com"
Dim Http As New XMLHTTP60, Html As New HTMLDocument
Dim editInfo As Object, I&, targetUrl$, postTile$
With Http
.Open "GET", LINK, False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll(".summary .question-hyperlink")
For I = 0 To .Length - 1
postTitle = .item(I).innerText 'I like this line to be transferred to the location below
targetUrl = Replace(.item(I).getAttribute("href"), "about:", prefix)
With Http
.Open "GET", targetUrl, False
.send
Html.body.innerHTML = .responseText
End With
R = R + 1: Cells(R, 1) = postTitle 'here I wish to use the above line like this
Set editInfo = Html.querySelector(".user-action-time > a")
If Not editInfo Is Nothing Then
Cells(R, 2) = editInfo.innerText
End If
Next I
End With
End Sub
You are overwriting your html document in the loop. A simple way would be to use a second htmldocument variable. A more verbose way would be to store the titles before the loop, for example in an array during an additional loop, then use your i variable to index into that to retrieve each title during the existing loop.
Sub ImportTitleFromAnotherLocation()
Const LINK$ = "https://stackoverflow.com/questions/tagged/web-scraping"
Const prefix$ = "https://stackoverflow.com"
Dim Http As New XMLHTTP60, Html As New HTMLDocument, Html2 As New HTMLDocument
Dim editInfo As Object, I&, targetUrl$, postTile$
Dim postTitle As String, r As Long
With Http
.Open "GET", LINK, False
.send
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll(".summary .question-hyperlink")
For I = 0 To .Length - 1
postTitle = .item(I).innerText 'I like this line to be transferred to the location below
targetUrl = Replace$(.item(I).getAttribute("href"), "about:", prefix)
With Http
.Open "GET", targetUrl, False
.send
Html2.body.innerHTML = .responseText
End With
r = r + 1: ActiveSheet.Cells(r, 1) = postTitle 'here I wish to use the above line like this
Set editInfo = Html2.querySelector(".user-action-time > a")
If Not editInfo Is Nothing Then
ActiveSheet.Cells(r, 2) = editInfo.innerText
End If
Next I
End With
End Sub

Object does not support this property of method, while parsing html document

I want to scrap every post heading form this blog. I am using the code bellow but it's giving me an error "Run time error 438 object does not support this property or method" in line
Cells(i, 1).Value = ele.getElementsByClassName("entry-title")(0).getElementsByTagName("a")(0).innerText
The code is:
Private Sub CommandButton1_Click()
Dim bot As Object
Dim doc As New HTMLDocument
Dim ele As HTMLElementCollection
Dim i As Long
Set bot = CreateObject("MSXML2.XMLHTTP")
bot.Open "GET", "http://themakeupblogger.com/makeup/", False
bot.send
doc.body.innerHTML = bot.responseText
For Each ele In doc.getElementsByTagName("article")
i = Cells(Rows.Count, 1).End(xlUp).Row + 1
Cells(i, 1).Value = ele.getElementsByClassName("entry-title")(0).getElementsByTagName("a")(0).innerText
Next ele
End Sub
Give this a shot and get all the titles you are after.
Sub demo()
Dim http As New XMLHTTP60, html As New HTMLDocument
Dim r As Long, ele As Object
With http
.Open "GET", "http://themakeupblogger.com/makeup/", False
.send
html.body.innerHTML = .responseText
End With
For Each elem In html.getElementsByClassName("entry-title")
With elem.getElementsByTagName("a")
If .Length Then r = r + 1: Cells(r, 1) = .Item(0).innerText
End With
Next elem
End Sub
Reference to add to the library:
1. Microsoft XML, v6.0
2. Microsoft HTML Object Library
Partial results:
4 High-Coverage Foundations That Might As Well Be Skincare
10 Memorial Day Beauty Essentials That Belong In Your Beach Bag
Don’t Get Married Without These Wedding Day Makeup Tips (Courtesy of a Makeup Artist)
To get the articles from that page you can do something like:
Sub demo()
Dim http As New InternetExplorer, html As New HTMLDocument
Dim r As Long, elem As Object
With http
.Visible = False
.navigate "http://themakeupblogger.com/makeup/"
Do Until .readyState = READYSTATE_COMPLETE: Loop
Set html = .document
End With
For Each elem In html.getElementsByTagName("article")
With elem.getElementsByTagName("h1")
If .Length Then r = r + 1: Cells(r, 1) = .Item(0).getElementsByTagName("a")(0).innerText
End With
With elem.getElementsByTagName("div")(3).getElementsByTagName("p")
If .Length Then Cells(r, 2) = .Item(0).innerText
End With
Next elem
End Sub
This time the reference you should add to the library:
1. Microsoft Internet Controls
2. Microsoft HTML Object Library

Resources