Print title imported from one location to another - Excel

I've created a VBA script to parse the titles of different posts, along with the editing status of those posts, from a website. What I wish to do now is let my script parse the title from its landing page but print it at the same time as it prints the editing status. I do not wish to create two subs for this task, and I'm not even sure it is possible in VBA. If anything is unclear, please check out the comments within my script.
Sub ImportTitleFromAnotherLocation()
    Const LINK$ = "https://stackoverflow.com/questions/tagged/web-scraping"
    Const prefix$ = "https://stackoverflow.com"
    Dim Http As New XMLHTTP60, Html As New HTMLDocument
    Dim editInfo As Object, I&, R&, targetUrl$, postTitle$

    With Http
        .Open "GET", LINK, False
        .send
        Html.body.innerHTML = .responseText
    End With

    With Html.querySelectorAll(".summary .question-hyperlink")
        For I = 0 To .Length - 1
            postTitle = .item(I).innerText 'I would like this line to be transferred to the location below
            targetUrl = Replace(.item(I).getAttribute("href"), "about:", prefix)

            With Http
                .Open "GET", targetUrl, False
                .send
                Html.body.innerHTML = .responseText
            End With

            R = R + 1: Cells(R, 1) = postTitle 'here I wish to use the above line like this
            Set editInfo = Html.querySelector(".user-action-time > a")
            If Not editInfo Is Nothing Then
                Cells(R, 2) = editInfo.innerText
            End If
        Next I
    End With
End Sub

You are overwriting your HTML document in the loop. A simple way around that is to use a second HTMLDocument variable, as shown below. A more verbose way would be to store the titles before the loop, for example in an array filled during an additional pass, then use your I variable to index into that array to retrieve each title during the existing loop (a sketch of that approach appears after the code).
Sub ImportTitleFromAnotherLocation()
    Const LINK$ = "https://stackoverflow.com/questions/tagged/web-scraping"
    Const prefix$ = "https://stackoverflow.com"
    Dim Http As New XMLHTTP60, Html As New HTMLDocument, Html2 As New HTMLDocument
    Dim editInfo As Object, I&, targetUrl$
    Dim postTitle As String, r As Long

    With Http
        .Open "GET", LINK, False
        .send
        Html.body.innerHTML = .responseText
    End With

    With Html.querySelectorAll(".summary .question-hyperlink")
        For I = 0 To .Length - 1
            postTitle = .item(I).innerText 'title captured from the landing page
            targetUrl = Replace$(.item(I).getAttribute("href"), "about:", prefix)

            With Http
                .Open "GET", targetUrl, False
                .send
                Html2.body.innerHTML = .responseText 'post page goes into the second document
            End With

            r = r + 1: ActiveSheet.Cells(r, 1) = postTitle 'title written here, alongside the editing status
            Set editInfo = Html2.querySelector(".user-action-time > a")
            If Not editInfo Is Nothing Then
                ActiveSheet.Cells(r, 2) = editInfo.innerText
            End If
        Next I
    End With
End Sub
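
For completeness, here is a minimal sketch of the more verbose array approach mentioned above: cache the titles (and target URLs) in a first pass, then index into those arrays during the detail loop. The selector, prefix and sheet references are simply carried over from the code above; treat it as an outline rather than a drop-in replacement.
Sub ImportTitleUsingArray()
    Const LINK$ = "https://stackoverflow.com/questions/tagged/web-scraping"
    Const prefix$ = "https://stackoverflow.com"
    Dim Http As New XMLHTTP60, Html As New HTMLDocument
    Dim titles() As String, urls() As String
    Dim editInfo As Object, I As Long, r As Long

    With Http
        .Open "GET", LINK, False
        .send
        Html.body.innerHTML = .responseText
    End With

    'first pass: cache titles and target urls before the document gets overwritten
    With Html.querySelectorAll(".summary .question-hyperlink")
        If .Length = 0 Then Exit Sub
        ReDim titles(0 To .Length - 1): ReDim urls(0 To .Length - 1)
        For I = 0 To .Length - 1
            titles(I) = .item(I).innerText
            urls(I) = Replace$(.item(I).getAttribute("href"), "about:", prefix)
        Next I
    End With

    'second pass: the same HTMLDocument can safely be reused for each post page
    For I = LBound(titles) To UBound(titles)
        With Http
            .Open "GET", urls(I), False
            .send
            Html.body.innerHTML = .responseText
        End With
        r = r + 1: ActiveSheet.Cells(r, 1) = titles(I)
        Set editInfo = Html.querySelector(".user-action-time > a")
        If Not editInfo Is Nothing Then ActiveSheet.Cells(r, 2) = editInfo.innerText
    Next I
End Sub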

Related

Problems While Scraping with VBA - VBA Web Scraping

I'm trying to get a number from a static page on a website, but when I make the HTML request, the result is a strange HTML without the information from the original HTML that I want.
The website that I'm trying to get the information is:
https://fnet.bmfbovespa.com.br/fnet/publico/exibirDocumento?id=233361&cvm=true
but I can get the same result with:
https://fnet.bmfbovespa.com.br/fnet/publico/visualizarDocumento?id=233361&cvm=true
The number that I want to get is the number "0,05" on the page.
My code is:
Sub trying()
    Dim html As HTMLDocument
    Dim element As String
    Set html = New HTMLDocument
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "https://fnet.bmfbovespa.com.br/fnet/publico/exibirDocumento?id=233361&cvm=true&", False
        .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT" 'to deal with potential caching
        .send
        html.body.innerHTML = .responseText
    End With
    element = html.getElementsByTagName("td")(31).innerText
    Sheets("Sheet1").Cells(1, 1) = element
End Sub
I have also tried doing that using InternetExplorer.Application, but the problem stays the same.
After trying a bunch of request headers, it turns out the Accept request header is required to get the response back as HTML:
Sub trying()
    Dim html As HTMLDocument
    Set html = New HTMLDocument
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "http://fnet.bmfbovespa.com.br/fnet/publico/exibirDocumento?id=233361&cvm=true&", False
        .setRequestHeader "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
        .send
        html.body.innerHTML = .responseText
    End With
    Dim element As String
    element = html.getElementsByTagName("td")(32).innerText
    Sheets("Sheet1").Cells(1, 1) = element
End Sub
Just add a .htm (or .html) extension to the request to specify the file type wanted.
Option Explicit

Public Sub trying()
    Dim html As MSHTML.HTMLDocument
    Set html = New MSHTML.HTMLDocument
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "https://fnet.bmfbovespa.com.br/fnet/publico/exibirDocumento.htm?id=233361", False
        .send
        html.body.innerHTML = .responseText
    End With
    Debug.Print html.querySelector("tr:nth-child(6) .dado-valores").innerText
End Sub
Alternatives that are a little more effort include adding the Accept header, or base64-decoding the .responseText before writing it to .innerHTML.
If you went down the less preferable base64-decode route, using the function from here (note: pretty sure this is not the original source), you would need to amend the following line:
.DataType = "bin.base64": .Text = Replace$(b64, Chr$(34), vbNullString) 'modified line
Public Function DecodeBase64(b64$)
    Dim b
    With CreateObject("Microsoft.XMLDOM").createElement("b64")
        .DataType = "bin.base64": .Text = Replace$(b64, Chr$(34), vbNullString) 'modified line
        b = .nodeTypedValue
        With CreateObject("ADODB.Stream")
            .Open: .Type = 1: .Write b: .Position = 0: .Type = 2: .Charset = "utf-8"
            DecodeBase64 = .ReadText
            .Close
        End With
    End With
End Function
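
If you did go that route, a minimal sketch of how the decode might be wired up, assuming the endpoint returns the document as a base64 string in .responseText (the URL and selector are carried over from the code above):
Public Sub tryingBase64Route()
    Dim html As MSHTML.HTMLDocument
    Set html = New MSHTML.HTMLDocument
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "https://fnet.bmfbovespa.com.br/fnet/publico/exibirDocumento?id=233361&cvm=true", False
        .send
        'decode the base64 payload before handing it to the HTML parser
        html.body.innerHTML = DecodeBase64(.responseText)
    End With
    Debug.Print html.querySelector("tr:nth-child(6) .dado-valores").innerText
End Sub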

How to skip a row in Excel with missing html tag using VBA

There are 15 objects listed on this website, each with a link under the photo; the 6th object has none. When extracting and transferring the content with my code, the missing href is not skipped, so 14 hrefs end up listed one below the other in Excel: the 6th cell, which should remain empty or say "no document", gets filled, and only the last cell stays empty (an off-by-one error, 14 vs. 15). Unfortunately I have to keep my code structure and just need a loop or condition to complete it. Does anyone have any ideas? Thanks.
My incomplete code:
Public Sub GetData()
    Dim html As New HTMLDocument
    Dim elmt01 As Object, elmt02 As Object
    Dim y As Long
    Dim xURL As String

    Set html = New MSHTML.HTMLDocument
    xURL = "https://immobilienpool.de/suche/immobilien?page=1"

    With CreateObject("MSXML2.XMLHTTP.6.0")
        .Open "GET", xURL, False
        .send
        html.body.innerHTML = .responseText
    End With

    Set elmt01 = html.querySelectorAll("li[class*='contentBox']")     '15 items
    Set elmt02 = html.querySelectorAll("li a[title*='zusätzliche']")  '14 hrefs

    For y = 0 To elmt01.Length - 1
        If InStr(elmt02, "pdf") Then 'better: If elmt02 exists in elmt01 then...
            ActiveSheet.Cells(y + 1, 2) = elmt02.Item(y).href
        Else
            ActiveSheet.Cells(y + 1, 2) = "No document"
        End If
    Next
End Sub
The following script should solve the issue you are having. I had to modify your code slightly to handle the listing with the missing link. I hope you will be able to work with the current version:
Public Sub GetData()
    Dim Html As HTMLDocument, HTMLDoc As HTMLDocument
    Dim oPdfLink As Object, xURL As String, I As Long

    Set Html = New MSHTML.HTMLDocument
    Set HTMLDoc = New MSHTML.HTMLDocument
    xURL = "https://immobilienpool.de/suche/immobilien?page=1"

    With CreateObject("MSXML2.XMLHTTP.6.0")
        .Open "GET", xURL, False
        .send
        Html.body.innerHTML = .responseText
    End With

    With Html.querySelectorAll("li[class*='contentBox']")
        For I = 0 To .Length - 1
            HTMLDoc.body.innerHTML = .item(I).outerHTML
            Set oPdfLink = HTMLDoc.querySelector("a[title*='zusätzliche']")
            If Not oPdfLink Is Nothing Then
                ActiveSheet.Cells(I + 1, 2) = oPdfLink.href
            Else
                ActiveSheet.Cells(I + 1, 2) = "No document"
            End If
        Next I
    End With
End Sub

Scrape Cargo Number Tracking Status with XMLHTTP Request with dynamic content

I have to create several functions that get the status of the supplied cargo number from each different website.
Below is the code user Zwenn helped me with. However, I am not familiar with the RegEx and Replace methods of VBA.
I am trying to simplify this code so I can replicate it for other websites. I understand that each website will need unique code, but it would be ideal if the base stayed the same and I only had to modify the exact element to be scraped.
Function FlightStat_AF(cargoNo As Variant) As String
    Const url = "https://www.afklcargo.com/mycargo/api/shipment/detail/057-"
    Dim elem As Object
    Dim Result As String
    Dim askFor As String

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", url & cargoNo, False
        .send
        Result = .responseText
        If .Status = 200 Then
            If InStr(1, Result, "faultDescription") = 0 Then
                askFor = """metaStatus"""
            Else
                askFor = """faultDescription"""
            End If
            With CreateObject("VBScript.RegExp")
                .Global = True
                .MultiLine = True
                .Pattern = askFor & ":(.*?),"
                Set elem = .Execute(Result)
            End With
            Result = Replace(elem(0).SubMatches(0), Chr(34), "")
        Else
            Result = "No cargoID"
        End If
    End With
    FlightStat_AF = Result
End Function
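For reference on the RegEx portion above, which the question flags as unfamiliar: the pattern "metaStatus":(.*?), matches the JSON key and lazily captures everything up to the next comma, and the Replace call then strips the embedded quotes. A small self-contained sketch of that idea, run against a hard-coded and purely illustrative sample string:
Sub RegexCaptureSketch()
    Dim sample As String, matches As Object
    'hypothetical response fragment, for illustration only
    sample = "{""awb"":""057-12345675"",""metaStatus"":""DELIVERED"",""origin"":""CDG""}"
    With CreateObject("VBScript.RegExp")
        .Global = True
        .Pattern = """metaStatus"":(.*?),"   'capture whatever follows the key, lazily, up to the next comma
        Set matches = .Execute(sample)
        'SubMatches(0) holds the captured group; Replace strips the embedded quotes
        Debug.Print Replace(matches(0).SubMatches(0), Chr(34), "")   'prints DELIVERED
    End With
End Sub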
I am trying to create a similar function for the below website.
URL = https://booking.unitedcargo.com/skychain/app?service=page/nwp:Trackshipmt&doc_typ=AWB&awb_pre=016&awb_no=
Sample CargoNo = 60848034
The element to scrape is highlighted in yellow
The following should fetch you the required status as long as it is available.
Sub PrintStatus()
    MsgBox GetDeliveryStat("60848034")
End Sub

Function GetDeliveryStat(cargoNo As Variant) As String
    Const Url = "https://booking.unitedcargo.com/skychain/app?service=page/nwp:Trackshipmt&doc_typ=AWB&awb_pre=016&awb_no="
    Dim dStatCheck$, deliveryStat$, S$

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", Url & cargoNo, False
        .send
        S = .responseText
    End With

    With CreateObject("HTMLFile")
        .write S
        On Error Resume Next
        dStatCheck = .getElementById("trackShiptablerowInner0").getElementsByTagName("b")(0).innerText
        On Error GoTo 0
        If dStatCheck <> "" Then
            deliveryStat = dStatCheck
        Else
            deliveryStat = "Not Found"
        End If
    End With
    GetDeliveryStat = deliveryStat
End Function
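Since the goal is a base that can be replicated across carriers, one way to factor out the shared part is a helper that fetches and parses the page, leaving only the extraction line to change per site. This is only a sketch; the names GetTrackingHtml and FlightStat_UA_Sketch below are hypothetical, not from the original answer.
'Reusable base: fetch and parse once, then let each carrier-specific
'function pull out its own element.
Function GetTrackingHtml(ByVal fullUrl As String) As Object
    Dim S As String, doc As Object
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", fullUrl, False
        .send
        S = .responseText
    End With
    Set doc = CreateObject("HTMLFile")
    doc.write S
    Set GetTrackingHtml = doc
End Function

Function FlightStat_UA_Sketch(cargoNo As Variant) As String
    Const Url = "https://booking.unitedcargo.com/skychain/app?service=page/nwp:Trackshipmt&doc_typ=AWB&awb_pre=016&awb_no="
    Dim doc As Object, Result As String
    Set doc = GetTrackingHtml(Url & cargoNo)
    On Error Resume Next
    'only this extraction line needs to change from site to site
    Result = doc.getElementById("trackShiptablerowInner0").getElementsByTagName("b")(0).innerText
    On Error GoTo 0
    If Result = "" Then Result = "Not Found"
    FlightStat_UA_Sketch = Result
End Function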

Capture all the data

I have a question relating to HTML parsing. I would like to pull text from this site into my current spreadsheet, but my code can only loop through one page.
Sub Data()
    Dim Http As New XMLHTTP60, Html As New HTMLDocument, topic As HTMLHtmlElement
    Dim x As Long

    With Http
        .Open "GET", "https://voronezh.leroymerlin.ru/catalogue/dekorativnye-oboi/?sortby=1&display=90", False
        .send
        Html.body.innerHTML = .responseText
    End With

    For Each topic In Html.getElementsByClassName("ui-product-card__info")
        With topic.getElementsByClassName("product-name")
            If .Length Then x = x + 1: Cells(x, 1) = .item(0).innerText
        End With
        With topic.getElementsByClassName("main-value-part")
            If .Length Then Cells(x, 2) = .item(0).innerText
        End With
    Next topic
End Sub
How can I loop to the next page in the process to capture all the data?
Do you mean you want to get text from the following pages of the website?
You can just continue the way you do, but loop through the page number:
Dim i As Integer
For i = 1 To 96
    'Do here the same as what you were doing, but replace your website string with:
    '"https://voronezh.leroymerlin.ru/catalogue/dekorativnye-oboi/?display=90&sortby=1&page=" & i
Next i
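Putting that together with the original Data sub, a sketch of the full paging loop might look like the following; the page count of 96 is taken from the snippet above and would need checking against the live site:
Sub DataAllPages()
    Dim Http As New XMLHTTP60, Html As New HTMLDocument, topic As HTMLHtmlElement
    Dim i As Integer, x As Long

    For i = 1 To 96 'assumed page count; adjust to the site's real number of pages
        With Http
            .Open "GET", "https://voronezh.leroymerlin.ru/catalogue/dekorativnye-oboi/?display=90&sortby=1&page=" & i, False
            .send
            Html.body.innerHTML = .responseText
        End With

        'same parsing as the original Data sub, repeated per page
        For Each topic In Html.getElementsByClassName("ui-product-card__info")
            With topic.getElementsByClassName("product-name")
                If .Length Then x = x + 1: Cells(x, 1) = .item(0).innerText
            End With
            With topic.getElementsByClassName("main-value-part")
                If .Length Then Cells(x, 2) = .item(0).innerText
            End With
        Next topic
    Next i
End Sub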

VBA Scraping across multiple webpages

So, I have the following code to scrape the data from a website and it's working without any problem.
My issue now is that I need to run the code through multiple webpages, because the website I'm scraping uses a pagination script.
E.g. a single page has 48 records, but in most cases a category has 200+ records, sub-divided over 3 or 4 pages.
My code:
Public Sub Roupa()
    Dim data As Object, i As Long, html As HTMLDocument, r As Long, c As Long, item As Object, div As Object
    Set html = New HTMLDocument '<== VBE > Tools > References > Microsoft HTML Object Library

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=100", False
        .send
        html.body.innerHTML = .responseText
    End With

    Set data = html.getElementsByClassName("w-product__content")
    For Each item In data
        r = r + 1: c = 1
        For Each div In item.getElementsByTagName("div")
            With ThisWorkbook.Worksheets("Roupa")
                .Cells(r, c) = div.innerText
            End With
            c = c + 1
        Next
    Next
    Sheets("Roupa").Range("A:A,C:C,F:F,G:G,H:H,I:I").EntireColumn.Delete
End Sub
UPDATE
I've tried adding For n = 1 To 2 before the With block; it works, but I need to know the exact number of pages, so that's not so helpful.
Work out how many pages there are by dividing the result count by the results per page, then loop, concatenating the appropriate page number onto the URL.
Option Explicit

Public Sub Roupa()
    Dim data As Object, i As Long, html As HTMLDocument, r As Long, c As Long, item As Object, div As Object
    Set html = New HTMLDocument '<== VBE > Tools > References > Microsoft HTML Object Library
    Const RESULTS_PER_PAGE As Long = 48
    Const START_URL As String = "https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=" & RESULTS_PER_PAGE & "&page=1"

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", START_URL, False
        .setRequestHeader "User-Agent", "Mozilla/5.0"
        .send
        html.body.innerHTML = .responseText

        Dim numPages As Long, numResults As Long, arr() As String
        arr = Split(html.querySelector(".w-filters__element").innerText, Chr$(32))
        numResults = arr(UBound(arr))
        numPages = 1
        If numResults > RESULTS_PER_PAGE Then
            numPages = Application.RoundUp(numResults / RESULTS_PER_PAGE, 0)
        End If

        For i = 1 To numPages
            If i > 1 Then
                .Open "GET", Replace$("https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=" & RESULTS_PER_PAGE & "&page=1", "page=1", "page=" & i), False
                .setRequestHeader "User-Agent", "Mozilla/5.0"
                .send
                html.body.innerHTML = .responseText
            End If
            Set data = html.getElementsByClassName("w-product__content")
            For Each item In data
                r = r + 1: c = 1
                For Each div In item.getElementsByTagName("div")
                    With ThisWorkbook.Worksheets("Roupa")
                        .Cells(r, c) = div.innerText
                    End With
                    c = c + 1
                Next
            Next
        Next
    End With
    Sheets("Roupa").Range("A:A,C:C,F:F,G:G,H:H,I:I").EntireColumn.Delete
End Sub
Thinking about what @AhmedAu said: if the page has loaded properly, a good way to also get the page count is to simply use:
numPages = html.querySelectorAll("[data-page]").Length
