How to scrape a table? - excel

The website I am trying to scrape: https://apps.nl.tele2.nl/zakelijk/postcodecheck/business-extern
It is a tool that scrapes zip codes + availabilities
Sub Pulldatafromweb()
Dim IE As Object
Dim doc As HTMLDocument
Dim ieobj As InternetExplorer
Dim htmlEle As IHTMLElement
Dim i As Integer
i = 1
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True
IE.navigate "https://apps.nl.tele2.nl/zakelijk/postcodecheck/business-extern"
Do While IE.Busy Or IE.readyState <> 4
Application.Wait DateAdd("s", 1, Now)
Loop
Set doc = IE.document
doc.getElementById("postcode").Value = ThisWorkbook.Sheets("sheet2").Range("A3").Value
doc.getElementById("housenumber").Value = ThisWorkbook.Sheets("sheet2").Range("b3").Value
doc.getElementById("submit").Click
For Each htmlelement In ieobj.document.getElementsByClassName("wrapper")(0).getElementsByTagName("TR")
With ActiveSheet
.Range("A" & i).Value = htmlEle.Children(0).textContent
.Range("B" & i).Value = htmlEle.Children(1).textContent
.Range("C" & i).Value = htmlEle.Children(2).textContent
End With
i = i + 1
Next
End Sub
It seems there is an issue with the name of the table.

You appear to be looking for the wrong element, the table you want has the classname 'infotable'.
This code returns the data from that table.
Sub Pulldatafromweb()
Dim IE As Object
Dim doc As HTMLDocument
Dim ieobj As InternetExplorer
Dim htmlElement As IHTMLElement
Dim i As Long
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True
IE.navigate "https://apps.nl.tele2.nl/zakelijk/postcodecheck/business-extern"
Do While IE.Busy Or IE.readyState <> 4
Application.Wait DateAdd("s", 1, Now)
Loop
Set doc = IE.document
doc.getElementById("postcode").Value = ThisWorkbook.Sheets("sheet2").Range("A3").Value
doc.getElementById("housenumber").Value = ThisWorkbook.Sheets("sheet2").Range("b3").Value
doc.getElementById("submit").Click
Set doc = IE.document
For Each htmlElement In doc.getElementsByClassName("infotable")(0).getElementsByTagName("tr")
i = i + 1
With ActiveSheet
.Range("A" & i).Value = htmlElement.Children(0).textContent
.Range("B" & i).Value = htmlElement.Children(1).textContent
.Range("C" & i).Value = htmlElement.Children(2).textContent
End With
Next htmlElement
End Sub

Related

Use Excel VBA to Extract Data From a Webpage

I'm trying to use Excel VBA to extract some data from a webpage (https://www.churchofjesuschrist.org/maps/meetinghouses/lang=eng&q=1148+W+100+N). The code I'm using will open Internet Explorer, navigate to the website, and it will extract the top most result. But I can't seem to figure out how to extract the rest of the results (i.e. ward, language, contact name, contact #). Thoughts?
Sub MeethinghouseLocator()
Dim IE As Object
Set IE = CreateObject("InternetExplorer.Application")
IE.navigate Sheets("Sheet1").Range("A1").Value
IE.Visible = True
While IE.Busy
DoEvents
Wend
Application.Wait (Now + TimeValue("0:00:01"))
IE.document.querySelector("button.search-input__execute.button--primary").Click
Dim Doc As HTMLDocument
Set Doc = IE.document
Application.Wait (Now + TimeValue("0:00:01"))
'WardName
Dim aaaaFONT As String
aaaaFONT = Trim(Doc.getElementsByClassName("location-header__name ng-binding")(0).innerText)
Sheets("Sheet1").Range("D6").Value = aaaaFONT
Application.Wait (Now + TimeValue("0:00:01"))
'Language
Dim aaabFONT As String
aaabFONT = Trim(Doc.getElementsByClassName("location-header__language ng-binding ng-scope")(0).innerText)
Sheets("Sheet1").Range("E6").Value = aaabFONT
'Click 1st Link
IE.document.getElementsByClassName("location-header__name ng-binding")(0).Click
Application.Wait (Now + TimeValue("0:00:01"))
'Contact Name
Dim aaacFONT As String
aaacFONT = Trim(Doc.getElementsByClassName("maps-card__group maps-card__group--inline ng-scope")(2).innerText)
Sheets("Sheet1").Range("H6").Value = aaacFONT
'Contact Name Function
Range("F6").Select
ActiveCell.FormulaR1C1 = _
"=LEFT(RIGHT(RC[2],LEN(RC[2])-FIND(CHAR(10),RC[2])-3),FIND(RIGHT(RIGHT(RC[2],LEN(RC[2])-FIND(CHAR(10),RC[2])-3),LEN(RIGHT(RC[2],LEN(RC[2])-FIND(CHAR(10),RC[2])-3))-FIND(CHAR(10),RIGHT(RC[2],LEN(RC[2])-FIND(CHAR(10),RC[2])-3))),RIGHT(RC[2],LEN(RC[2])-FIND(CHAR(10),RC[2])-3))-1)"
'Contact Phone Number
Dim aaadFONT As String
aaadFONT = Trim(Doc.getElementsByClassName("phone ng-binding")(0).innerText)
Sheets("Sheet1").Range("G6").Value = aaadFONT
IE.Quit
End Sub
Most of your code works actually so I'm not sure what issue are you facing but you didn't account for the loading after clicking each link so I have added While loop to check for its Ready and ReadyState property before continuing.
EDIT: The code now loops through all the wards listed in the result, the idea is to keep the first IE at the result page and pass the URL of the ward and the input row to sub ExtractWard where it will open another IE, navigate to the given URL and extract the ward details.
Sub MeethinghouseLocator()
Dim IE As Object
Set IE = CreateObject("InternetExplorer.Application")
IE.navigate Sheets("Sheet1").Range("A1").Value
IE.Visible = True
While IE.Busy Or IE.readyState <> 4
DoEvents
Wend
IE.document.querySelector("button.search-input__execute.button--primary").Click
While IE.Busy Or IE.readyState <> 4
DoEvents
Wend
Dim Doc As HTMLDocument
Set Doc = IE.document
Application.Wait (Now + TimeValue("0:00:01"))
Dim wardContent As Object
Set wardContent = Doc.getElementsByClassName("maps-card__content")(2)
Dim wardCollection As Object
Set wardCollection = wardContent.getElementsByClassName("location-header")
Dim rowNum As Long
rowNum = 6
Dim i As Long
For i = 0 To wardCollection.Length - 1
With wardCollection(i)
'WardName
Dim aaaaFONT As String
aaaaFONT = Trim(.getElementsByClassName("location-header__name ng-binding")(0).innerText)
Sheets("Sheet1").Cells(rowNum, "D").Value = aaaaFONT
'Language
Dim aaabFONT As String
aaabFONT = Trim(.getElementsByClassName("location-header__language ng-binding ng-scope")(0).innerText)
Sheets("Sheet1").Cells(rowNum, "E").Value = aaabFONT
Dim wardURL As String
wardURL = .getElementsByClassName("location-header__name ng-binding")(0).href
ExtractWard wardURL, rowNum
End With
rowNum = rowNum + 1
Next i
Set Doc = Nothing
IE.Quit
Set IE = Nothing
End Sub
Private Sub ExtractWard(argURL As String, argRow As Long)
Dim IE As Object
Set IE = CreateObject("InternetExplorer.Application")
IE.navigate argURL
IE.Visible = True
While IE.Busy Or IE.readyState <> 4
DoEvents
Wend
Dim Doc As HTMLDocument
Set Doc = IE.document
'Contact Name
Dim aaacFONT As String
aaacFONT = Trim(Doc.getElementsByClassName("maps-card__group maps-card__group--inline ng-scope")(2).innerText)
Sheets("Sheet1").Cells(argRow, "H").Value = aaacFONT
'Contact Name Function
Sheets("Sheet1").Cells(argRow, "F").FormulaR1C1 = _
"=LEFT(RIGHT(RC[2],LEN(RC[2])-FIND(CHAR(10),RC[2])-3),FIND(RIGHT(RIGHT(RC[2],LEN(RC[2])-FIND(CHAR(10),RC[2])-3),LEN(RIGHT(RC[2],LEN(RC[2])-FIND(CHAR(10),RC[2])-3))-FIND(CHAR(10),RIGHT(RC[2],LEN(RC[2])-FIND(CHAR(10),RC[2])-3))),RIGHT(RC[2],LEN(RC[2])-FIND(CHAR(10),RC[2])-3))-1)"
'Contact Phone Number
Dim aaadFONT As String
aaadFONT = Trim(Doc.getElementsByClassName("phone ng-binding")(0).innerText)
Sheets("Sheet1").Cells(argRow, "G").Value = aaadFONT
Set Doc = Nothing
IE.Quit
Set IE = Nothing
End Sub

Web scraping with getElementsByTagName()

I want to import restaurant data like Restaurant name, phone number, website & address to excel but unfortunately I am getting ads & garbage data. I have created a code using http://automatetheweb.net/vba-getelementsbytagname-method/ website but it is not helping out. Please rectify the issue in my code.
Website:https://www.yellowpages.com/atlanta-ga/attorneys
Please donot refer json as it is not working on other webs.
Sub Yellowcom()
'Dim ieObj As InternetExplorer
Dim htmlELe As IHTMLElement
Dim HTML As HTMLDocument
Dim i As Integer
Dim URL As String
Dim URLParameter As String
Dim page As Long
Dim links As Object
Dim IE As Object
i = 1
Set IE = CreateObject("InternetExplorer.Application")
'Set ieObj = New InternetExplorer
IE.Visible = True
URL = "https://www.yellowpages.com/atlanta-ga/attorneys"
'Application.Wait Now + TimeValue("00:00:05")
For page = 2 To 4
If page > 1 Then URLParameter = "?page=" & page
IE.navigate URL & URLParameter
' Wait for the browser to load the page
Do Until IE.readyState = 4
DoEvents
Loop
Set HTML = IE.document
Set links = HTML.getElementsByClassName("info")
For Each htmlELe In links
With ActiveSheet
.Range("A" & i).Value = htmlELe.Children(0).textContent
.Range("B" & i).Value = htmlELe.getElementsByTagName("a")(0).href
.Range("C" & i).Value = htmlELe.Children(2).textContent
.Range("D" & i).Value = htmlELe.Children(2).querySelector("a[href]")
'links2 = htmlELe.getElementsByClassName("links")(1)
' .Range("D" & i).Value = links2.href
End With
i = i + 1
Next htmlELe
Next page
IE.Quit
Set IE = Nothing
End Sub
Required Output should be like this
I would use xhr rather than a browser and store data in an array for each page and write that out to sheet. You could really dimension one array to hold all results in advance based on results per page and number of pages but the below is still efficient
Option Explicit
Public Sub GetListings()
Dim html As HTMLDocument, page As Long, html2 As HTMLDocument
Dim results As Object, headers(), ws As Worksheet, i As Long
Const START_PAGE As Long = 1
Const END_PAGE As Long = 2
Set ws = ThisWorkbook.Worksheets("Sheet1")
headers = Array("Name", "Phone", "Website", "Address")
Application.ScreenUpdating = False
Set html = New HTMLDocument
Set html2 = New HTMLDocument
ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
With CreateObject("MSXML2.XMLHTTP")
For page = START_PAGE To END_PAGE
.Open "GET", "https://www.yellowpages.com/atlanta-ga/attorneys?page=" & page, False
.send
html.body.innerHTML = .responseText
Set results = html.querySelectorAll(".organic .result")
Dim output(), r As Long
ReDim output(1 To results.Length, 1 To 4)
r = 1
For i = 0 To results.Length - 1
On Error Resume Next
html2.body.innerHTML = results.item(i).outerHTML
output(r, 1) = html2.querySelector(".business-name").innerText
output(r, 2) = html2.querySelector(".phone").innerText
output(r, 3) = html2.querySelector(".track-visit-website").href
output(r, 4) = html2.querySelector(".street-address").innerText & " " & html2.querySelector(".locality").innerText
On Error GoTo 0
r = r + 1
Next
ws.Cells(GetLastRow(ws, 1) + 1, 1).Resize(UBound(output, 1), UBound(output, 2)) = output
Next
End With
Application.ScreenUpdating = True
End Sub
Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
With ws
GetLastRow = .Cells(.rows.Count, columnNumber).End(xlUp).Row
End With
End Function
Sample of output:
The info class is also used for the advertisements. You first need to go to the collection where the classname is "search-results organic" and in there find all the "info" classes.
This means that you need an extra collection variable:
Set HTML = IE.document
Set OrganicLinks = HTML.getElementsByClassName("search-results organic")
Set links = OrganicLinks.item(0).getElementsByClassName("info")
For getting the right website, you need to use another reference. It's better to get it by classname, since that one is more unique:
On Error Resume Next
.Range("B" & i).Value = htmlELe.getElementsByClassName("track-visit-website")(0).href
On Error GoTo 0

Extracting HTML code from websites by looping through list of URLs

I'm using Excel VBA to launch an IE browser tab based on the URL in each of the rows in column D. Then the relevant HTML code is extracted based on pre-defined classes and populated in columns A - C.
Pretty sure I missed a step. The process stops at D2 and does not proceed to extract HTML from the next URLs (in cells D3, D4, etc).
Thanks in advance for any suggestions!
Sub useClassnames()
Dim element As IHTMLElement
Dim elements As IHTMLElementCollection
Dim IE As InternetExplorer
Dim html As HTMLDocument
Dim shellWins As New ShellWindows
Dim IE_TabURL As String
Dim intRowPosition As Integer
Set IE = New InternetExplorer
IE.Visible = False
intRowPosition = 2
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True
IE.navigate Sheet1.Range("D" & intRowPosition)
While IE.Busy
DoEvents
Wend
intRowPosition = intRowPosition + 1
While Sheet1.Range("D" & intRowPosition) <> vbNullString
IE.navigate Sheet1.Range("D" & intRowPosition), CLng(2048)
While IE.Busy
DoEvents
Wend
intRowPosition = intRowPosition + 1
Wend
Do While IE.readyState <> READYSTATE_COMPLETE
Application.StatusBar = "Loading Web pageā€¦"
DoEvents
Loop
Set html = IE.document
Set elements = html.getElementsByClassName("container-bs")
Dim count As Long
Dim erow As Long
count = 0
For Each element In elements
If element.className = "container-bs" Then
erow = Sheet1.Cells(Rows.count, 1).End(xlUp).Offset(1, 0).Row
Cells(erow, 1) = html.getElementsByClassName("pull-left")(count + 1).innerHTML
Cells(erow, 2) = html.getElementsByClassName("description")(count).innerHTML
Cells(erow, 3) = html.getElementsByClassName("related-articles")(count).innerHTML
count = count + 1
End If
Next element
Range("A2:C2000").Select
Columns("A:A").EntireColumn.AutoFit
Columns("B:B").ColumnWidth = 36
End Sub
Your lines
Set html = IE.document
Set elements = html.getElementsByClassName("container-bs")
etc happen after the While loop. It needs to be inside.
Your If statement:
If element.className = "container-bs"
should be redundant as you are already looping over a collection of that classname; so I have removed this.
You are not working off element in the loop, so essentially you are using it to control your incremented counter variable. This suggests you can use a better coding strategy for retrieving the items of interest.
Always state the parent worksheet and don't rely on implicit Activesheet references - that is bug prone.
I would expect a structure more like as follows (I cannot account for refactoring to remove element)
Option Explicit
Public Sub UseClassnames()
Dim element As IHTMLElement, elements As IHTMLElementCollection, ie As InternetExplorer
Dim html As HTMLDocument, intRowPosition As Long
intRowPosition = 2
Set ie = CreateObject("InternetExplorer.Application")
ie.Visible = True
While Sheet1.Range("D" & intRowPosition) <> vbNullString
If intRowPosition = 2 Then
ie.navigate Sheet1.Range("D" & intRowPosition)
Else
ie.navigate Sheet1.Range("D" & intRowPosition), CLng(2048)
End If
While ie.Busy Or ie.readyState < 4: DoEvents: Wend
Set html = ie.document
Set elements = html.getElementsByClassName("container-bs")
Dim count As Long, erow As Long
count = 0
For Each element In elements
erow = Sheet1.Cells(Rows.count, 1).End(xlUp).Offset(1, 0).Row
With Sheet1
.Cells(erow, 1) = html.getElementsByClassName("pull-left")(count + 1).innerHTML
.Cells(erow, 2) = html.getElementsByClassName("description")(count).innerHTML
.Cells(erow, 3) = html.getElementsByClassName("related-articles")(count).innerHTML
End With
count = count + 1
Next element
intRowPosition = intRowPosition + 1
Wend
With Sheet1
.Range("A2:C2000").Select
.Columns("A:A").EntireColumn.AutoFit
.Columns("B:B").ColumnWidth = 36
End With
End Sub

How to Improve Data Scraping Using VBA?

I have below code, which fetch data from a Intranet. But its taking more time to fetch data.can someone help me to modify the code to increase the performance.
Thanks In Advance
Note-I haven't posted URL as it is clients website. Sorry about that.
Sub FetchData()
Dim IE As Object
Dim Doc As HTMLDocument
Dim myStr As String
On Error Resume Next
Set IE = CreateObject("InternetExplorer.Application") 'SetBrowser
IE.Visible = False
IE.navigate "URL" 'Open website
Do While IE.Busy Or IE.ReadyState <> 4
Application.Wait DateAdd("s", 1, Now)
Loop
Set Doc = IE.Document
Doc.getElementById("tbxUserID").Value = InputBox("Please Enter Your ID")
Doc.getElementById("txtPassword").Value = InputBox("Please Enter Your
Password")
Doc.getElementById("BtnLogin").Click
Do While IE.Busy Or IE.ReadyState <> 4
Application.Wait DateAdd("s", 1, Now)
Loop
IE.navigate "URL"
Do While IE.Busy Or IE.ReadyState <> 4
Application.Wait DateAdd("s", 1, Now)
Loop
Dim LastRow As Long
Set wks = ActiveSheet
LastRow = wks.Cells(wks.Rows.Count, "A").End(xlUp).Row
Set rowNo = wks.Range("A1:A" & LastRow)
For rowNo = 2 To LastRow
Doc.getElementById("txtField1").Value =
ThisWorkbook.Sheets("Sheet1").Range("A" & rowNo).Value
Doc.getElementById("CtrlQuickSearch1_imgBtnSumbit").Click
Do While IE.Busy Or IE.ReadyState <> 4
Application.Wait DateAdd("s", 1, Now)
Loop
strVal1 = Doc.querySelectorAll("span")(33).innerText
ThisWorkbook.Sheets("Sheet1").Range("B" & rowNo).Value = strVal1
strVal2 = Doc.querySelectorAll("span")(35).innerText
ThisWorkbook.Sheets("Sheet1").Range("C" & rowNo).Value = strVal2
Next
End Sub
Can't guarantee this will run. Points to note:
Use of Worksheets collection
Use of Option Explicit - this means you then have to use the right datatype throughout. Currently you have undeclared variables and, for example, rowNo is used as a Long and as a range.
Removal of On Error Resume Next
Putting all worksheets into variables
Placement of values into array and looping array to get id values. Looping sheet is expensive
Use of early binding and adding class to InternetExplorer
Assumption that after login a new url present and that you need to navigate back to that before each new loop value
Removal of hungarian notation
Ids are fastest selector method so no improvement there
With your css type selectors, e.g. .document.querySelectorAll("span")(33), you might seek whether there is a single node short selector that can be used, rather than using nodeList
VBA:
Option Explicit
Public Sub FetchData()
Dim ie As Object, ie As InternetExplorer
Dim lastRow As Long, wks As Worksheet, i As Long, ws As Worksheet
Set ie = New SHDocVw.InternetExplorer 'SetBrowser
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set wks = ActiveSheet '<==use explicit sheet name if possible
lastRow = wks.Cells(wks.rows.Count, "A").End(xlUp).Row
loopvalues = Application.Transpose(wks.Range("A2:A" & lastRow).Value)
With ie
.Visible = False
.Navigate2 "URL" 'Open website
While .Busy Or .readyState < 4: DoEvents: Wend
.document.getElementById("tbxUserID").Value = InputBox("Please Enter Your ID")
.document.getElementById("txtPassword").Value = InputBox("Please Enter Your Password")
.document.getElementById("BtnLogin").Click
While .Busy Or ie.readyState < 4: DoEvents: Wend
Dim newURL As String, val1 As String, val2 As String
newURL = .document.URL
For i = LBound(loopvalues) To UBound(loopvalues)
.document.getElementById("txtField1").Value = loopvalues(i)
.document.getElementById("CtrlQuickSearch1_imgBtnSumbit").Click
While .Busy Or .readyState < 4: DoEvents: Wend
val1 = .document.querySelectorAll("span")(33).innerText
ws.Range("B" & i).Value = val1
val2 = .document.querySelectorAll("span")(35).innerText
ws.Range("C" & i).Value = val2
.Navigate2 newURL
While .Busy Or ie.readyState < 4: DoEvents: Wend
Next
.Quit
End With
End Sub

Excel VBA - Extracting data from web page

I am trying to pull seller information from Amazon page with price by automating web browser. I am trying to run the below code, but the error I am getting is:
Object Variable or With Block variable not set.
Can someone guide me where i am going wrong.
Option Explicit
Sub RunNewModule()
Dim ie As InternetExplorer
Dim html As HTMLDocument
Set ie = CreateObject("InternetExplorer.Application")
ie.Visible = False
ie.Navigate "http://www.amazon.com/gp/offer-listing/B00SVA81Z2/ref=dp_olp_new_mbc?ie=UTF8&condition=new"
Dim priceData As Variant
Dim sellerdata As Variant
Dim item As Variant
Dim cntr As Integer
priceData = html.getElementsByClassName("olpOfferPrice").getElementsByTagName("span")(0).innerText
cntr = 1
For Each item In priceData
Range("B" & cntr) = item.innerText
cntr = cntr + 1
Next item
sellerdata = html.getElementsByClassName("olpSellerName").getElementsByTagName("span")(0).innerText
cntr = 1
For Each item In sellerdata
Range("A" & cntr) = item.innerText
cntr = cntr + 1
Next item
End Sub
You didn't assign html and it's null now.
You should assign it this way:
Set html= ie.Document
To get an element by it's class name:
Dim ie As InternetExplorer
Dim html As IHTMLDocument
Set ie = CreateObject("InternetExplorer.Application")
ie.Visible = False
ie.Navigate "http://stackoverflow.com/questions/34463544/vba-fetching-data-from-class-name"
While ie.Busy
DoEvents
Wend
While ie.ReadyState < 4
DoEvents
Wend
Set html = ie.Document
Dim elements As IHTMLElementCollection
Set elements = html.getElementsByClassName("question-hyperlink")
If elements.Length > 0 Then
MsgBox elements(0).innerText
End If
ie.Quit
Set ie = Nothing
Don't forget to add reference to:
Microsoft Internet Controls
Microsoft Html Object library
For that amazon link:
Dim ie As InternetExplorer
Dim html As HTMLDocument
Set ie = CreateObject("InternetExplorer.Application")
ie.Visible = False
ie.Navigate "http://www.amazon.in/gp/offer-listing/B00EYCBGNA/ref=dp_olp_new_mbc?ie=UTF8&condition=new"
While ie.Busy
DoEvents
Wend
While ie.ReadyState < 4
DoEvents
Wend
Set html = ie.Document
Dim elements As IHTMLElementCollection
Set elements = html.getElementsByClassName("olpOfferPrice")
For i = 0 To elements.Length - 1
Sheet1.Range("A" & (i + 1)) = elements(i).innerText
Next i
Set elements = html.getElementsByClassName("olpSellerName")
For i = 0 To elements.Length - 1
Sheet1.Range("B" & (i + 1)) = elements(i).innerText
Next i
ie.Quit
Set ie = Nothing

Resources