Scraping the business name using class="search-item-header" - Excel

I am trying to extract the Business Name from a website.
I am receiving an error on this line:
For iCnt = 0 To .getElementsByTagName("h2").Length - 1
I need to extract all the details available on the website, like:
Business Name
Address
Telephone
Fax
Email
Website
in a sequence so all the details can be pasted into an Excel file.
Option Explicit

Const sSiteName = "https://www.thoroughexamination.org/postcode-search/nationwide?page=1"

Private Sub getHTMLContents()
    ' Create Internet Explorer object.
    Dim IE As Object
    Set IE = CreateObject("InternetExplorer.Application")
    IE.Visible = False ' Keep this hidden.

    IE.Navigate sSiteName

    ' Wait till IE is fully loaded.
    While IE.ReadyState <> 4
        DoEvents
    Wend

    Dim oHDoc As HTMLDocument ' Create document object.
    Set oHDoc = IE.Document

    Dim oHEle As HTMLUListElement ' Create HTML element (<ul>) object.
    Set oHEle = oHDoc.getElementById("search-item-header") ' Get the element reference using its ID.

    Dim iCnt As Integer

    ' Loop through elements inside the <ul> element and find <h2>, which has the text we want.
    With oHEle
        For iCnt = 0 To .getElementsByTagName("h2").Length - 1
            Debug.Print .getElementsByTagName("h2").Item(iCnt).getElementsByTagName("a").Item(0).innerHTML
        Next iCnt
    End With

    ' Clean up.
    IE.Quit
    Set IE = Nothing
    Set oHEle = Nothing
    Set oHDoc = Nothing
End Sub
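For what it's worth, search-item-header is a class, not an id, so getElementById very likely returns Nothing here, and the With block then fails. A minimal sketch of the fix, keeping the rest of the routine unchanged, could look like this:

Dim oHEles As Object
' search-item-header is a class, so collect all matching elements.
Set oHEles = oHDoc.getElementsByClassName("search-item-header")
Dim iCnt As Integer
For iCnt = 0 To oHEles.Length - 1
    ' Each header contains an <a> whose text is the business name.
    Debug.Print oHEles.Item(iCnt).getElementsByTagName("a").Item(0).innerText
Next iCnt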
No response from my second attempt:
Sub TutorailsPoint()
    Const URL = "https://www.thoroughexamination.org/postcode-search/nationwide?page=1"
    Dim http As New MSXML2.XMLHTTP60, html As New HTMLDocument
    Dim topics As Object, posts As Object, topic As Object
    Dim x As Long
    x = 1
    http.Open "GET", URL, False
    http.send
    html.body.innerHTML = http.responseText
    Set topics = html.getElementsByClassName("search-item-header")
    For Each posts In topics
        For Each topic In posts.getElementsByTagName("h2")
            Cells(x, 1) = topic.innerText
            x = x + 1
        Next topic
    Next posts
End Sub

You can do what you want with XHR if you send a User-Agent header. Here is code that writes one dataset per row, with whichever fields are present, for all pages of your posted URL.
Sub TutorailsPoint()
    Dim doc As Object
    Dim url As String
    Dim page As Long
    Dim hits As Long
    Dim maxPage As Long
    Dim maxPageKnown As Boolean
    Dim currRow As Long
    Dim nodeAllGroups As Object
    Dim nodeOneGroup As Object
    Dim nodeContactData As Object
    Dim nodeWebSite As Object
    Dim telephone As Boolean

    page = 1
    maxPage = 1
    currRow = 2

    Set doc = CreateObject("htmlFile")
    With CreateObject("MSXML2.ServerXMLHTTP.6.0")
        'Call all pages
        Do
            url = "https://www.thoroughexamination.org/postcode-search/nationwide?page=" & page
            .Open "GET", url, False
            .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0"
            .send
            If .Status = 200 Then
                doc.body.innerHTML = .responseText
                'How many pages to call (20 results per page)
                If Not maxPageKnown Then
                    hits = CLng(doc.getElementsByClassName("summary")(0).getElementsByTagName("b")(1).innertext)
                    maxPage = hits \ 20 'Integer division
                    If hits Mod 20 <> 0 Then
                        maxPage = maxPage + 1
                    End If
                    maxPageKnown = True
                    'Debug.Print maxPage
                End If
                Set nodeAllGroups = doc.getElementsByClassName("group")
                For Each nodeOneGroup In nodeAllGroups
                    'Business name
                    Cells(currRow, 1) = nodeOneGroup.getElementsByTagName("h2")(0).innertext
                    'Address
                    Cells(currRow, 2) = nodeOneGroup.getElementsByTagName("p")(0).innertext
                    'Contact block
                    Set nodeContactData = nodeOneGroup.getElementsByClassName("depot")
                    If nodeContactData.Length <> 0 Then
                        'Telephone
                        If InStr(1, nodeContactData(0).innertext, "tel:") > 0 Then
                            Cells(currRow, 3).NumberFormat = "#"
                            Cells(currRow, 3) = Trim(nodeContactData(0).getElementsByTagName("strong")(0).innertext)
                            telephone = True
                        End If
                        'Fax
                        If InStr(1, nodeContactData(0).innertext, "fax:") > 0 Then
                            Cells(currRow, 4).NumberFormat = "#"
                            If telephone Then
                                Cells(currRow, 4) = Trim(Replace(nodeContactData(0).getElementsByTagName("p")(0).FirstChild.NextSibling.NextSibling.NextSibling.NodeValue, "fax:", ""))
                            Else
                                Cells(currRow, 4) = Trim(Replace(nodeContactData(0).getElementsByTagName("p")(0).FirstChild.NodeValue, "fax:", "")) 'Not sure; haven't seen a listing with fax but no telephone
                            End If
                        End If
                        'Email
                        If InStr(1, nodeContactData(0).innertext, "email:") > 0 Then
                            Cells(currRow, 5) = Trim(nodeContactData(0).getElementsByTagName("a")(0).innertext)
                        End If
                        'Website
                        Set nodeWebSite = nodeContactData(0).getElementsByClassName("website")
                        If nodeWebSite.Length > 0 Then
                            Cells(currRow, 6) = Trim(nodeWebSite(0).innertext)
                        End If
                    End If
                    telephone = False
                    currRow = currRow + 1
                Next nodeOneGroup
            Else
                Cells(currRow, 1) = "Page not loaded. HTTP status " & .Status
                Cells(currRow, 2) = url
                currRow = currRow + 1
            End If
            page = page + 1
        Loop While page <= maxPage
    End With
End Sub

Internet Explorer was retired by Microsoft, so it's not a good idea to use it. These days, in VBA, my preferred option is Selenium (Selenium Type Library) for scraping the web. To get started with Selenium the right way you have to take care of the following:
Update the related .NET Framework.
Install the SeleniumBasic app.
Download the version of chromedriver.exe exactly compatible with the version of Google Chrome installed on the machine. Extract chromedriver.exe and put it in the folder where SeleniumBasic was installed (I installed SeleniumBasic here: 'C:\Program Files\SeleniumBasic').
In the VBE, add a reference to the ActiveX library 'Selenium Type Library'.
Put the code in a standard module in the VBE:
Sub fnGetDataFromWWW()
    Dim oWD As WebDriver
    Dim post As Selenium.WebElement
    Dim groups As Selenium.WebElements
    Dim strText As String
    Dim intItem As Integer

    Set oWD = New Selenium.WebDriver
    oWD.Start "chrome"
    DoEvents
    oWD.Get "https://www.thoroughexamination.org/postcode-search/nationwide?page=1"
    DoEvents

    Set groups = oWD.FindElementsByClass("group")
    For Each post In groups
        strText = post.Attribute("outerText")
        For intItem = 0 To UBound(Split(strText, Chr(10)))
            If Trim(Split(strText, Chr(10))(intItem)) <> "" Then
                Debug.Print Split(strText, Chr(10))(intItem)
            End If
        Next
    Next post
End Sub
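If you want the values on a worksheet instead of in the Immediate window, the same Split logic can write each group's non-empty lines across one row (a sketch; the sheet name and the one-row-per-group layout are assumptions):

Dim currRow As Long, currCol As Long
currRow = 2
For Each post In groups
    strText = post.Attribute("outerText")
    currCol = 1
    For intItem = 0 To UBound(Split(strText, Chr(10)))
        If Trim(Split(strText, Chr(10))(intItem)) <> "" Then
            ' One field per column: name, address, contact details...
            ThisWorkbook.Worksheets("Sheet1").Cells(currRow, currCol) = Trim(Split(strText, Chr(10))(intItem))
            currCol = currCol + 1
        End If
    Next
    currRow = currRow + 1
Next post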

Related

Need to add a RegExp to a bulk email extractor - Excel VBA

(I have added new material at the bottom today.)
Edit, 7 Jan 2021, 3:50 PM UK time:
This part is a few days old and I have abandoned it unless someone can help; for the latest information please go to the bottom.
I have been attempting to update my code to extract emails using a RegExp rather than the mailto: check it currently uses. I posted on S.O. a few months back asking for help, which I received and am very grateful for. However, the new code was much longer than mine and more complicated for me to understand. I am very limited in VBA and was unable to make further changes to the new code. For the past few months I have been trying to update my existing code, with no luck.
As mentioned, I am very limited in VBA, but I do understand my own code and would like a MINOR change to it to enable me to extract emails via RegExp. Please DO NOT rewrite the whole code, as this was done once and is out of my depth.
This is my original code:
Sub ScrapeSoMeAndMailAddresses()
    'Columns for both tables
    Const colUrl As Long = 1      'Must always be the first column
    Const colMail As Long = 2     'Must always be the first column before the social media platforms
    Const colFacebook As Long = 3 'Must always be the last column of the social media platforms
    Const colError As Long = 4    'Must always be the last column

    Dim url As String
    Dim http As Object
    Dim htmlDoc As Object
    Dim nodeAllLinks As Object
    Dim nodeOneLink As Object
    Dim pageLoadSuccessful As Boolean
    Dim tableUrlsOneAddressLeft As String
    Dim tableAllAddresses As String
    Dim currentRowTableUrls As Long
    Dim lastRowTableUrls As Long
    Dim currentRowsTableAll(colUrl To colFacebook) As Long
    Dim lastRowTableAll As Long
    Dim addressCounters(colMail To colFacebook) As Long
    Dim checkCounters As Long

    'Initialize variables
    tableUrlsOneAddressLeft = "Urls" 'Name of sheet
    currentRowTableUrls = 2          'First row for content
    tableAllAddresses = "Results"    'Name of sheet
    For checkCounters = colUrl To colFacebook
        currentRowsTableAll(checkCounters) = 2 'First rows for content
    Next checkCounters

    Set htmlDoc = CreateObject("htmlfile")
    Set http = CreateObject("MSXML2.ServerXMLHTTP.6.0")

    'Clear all contents and comments in the URL source sheet from email column to error column
    With Sheets(tableUrlsOneAddressLeft)
        lastRowTableUrls = .Cells(Rows.Count, colUrl).End(xlUp).Row
        .Range(.Cells(currentRowTableUrls, colMail), .Cells(lastRowTableUrls, colError)).ClearContents
        .Range(.Cells(currentRowTableUrls, colMail), .Cells(lastRowTableUrls, colError)).ClearComments
    End With

    'Delete all rows except the headline in the sheet with all addresses
    lastRowTableAll = Sheets(tableAllAddresses).Cells(Rows.Count, colUrl).End(xlUp).Row
    Sheets(tableAllAddresses).Rows(currentRowsTableAll(colUrl) & ":" & lastRowTableAll).Delete Shift:=xlUp

    'Loop over all URLs in column A in the URL source sheet
    Do While Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, 1).Value <> ""
        'Scroll for visual monitoring, if the sheet with the URLs is the active one
        If ActiveSheet.Name = tableUrlsOneAddressLeft Then
            If currentRowTableUrls > 14 Then
                ActiveWindow.SmallScroll down:=1
            End If
            Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, 1).Select
        End If

        'Get next url from the URL source sheet
        url = Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colUrl).Value

        'Try to load the page; temporarily disable error handling in case
        'there is a timeout or another error
        On Error Resume Next
        http.Open "GET", url, False
        http.send
        'Check if page loading was successful
        If Err.Number = 0 Then
            pageLoadSuccessful = True
        End If
        On Error GoTo 0

        If pageLoadSuccessful Then
            'Build html document for DOM operations
            htmlDoc.body.innerHtml = http.responseText
            'Create node list from all links of the page
            Set nodeAllLinks = htmlDoc.getElementsByTagName("a")
            'Walk through all links of the node list
            For Each nodeOneLink In nodeAllLinks
                '############################ THIS IS THE START OF THE EMAIL SECTION ############################
                'Check for mail address
                If InStr(1, nodeOneLink.href, "mailto:") Then
                    'Write mail address to both tables
                    Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colMail).Value = Right(nodeOneLink.href, Len(nodeOneLink.href) - InStr(nodeOneLink.href, ":"))
                    Sheets(tableAllAddresses).Cells(currentRowsTableAll(colMail), colMail).Value = Right(nodeOneLink.href, Len(nodeOneLink.href) - InStr(nodeOneLink.href, ":"))
                    'Check if it is a new line in the sheet with all addresses
                    If currentRowsTableAll(colMail) >= currentRowsTableAll(colUrl) Then
                        'Write URL in the new line of the sheet with all addresses
                        Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
                        'Increment url counter
                        currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
                    End If
                    'Increment mail counters
                    currentRowsTableAll(colMail) = currentRowsTableAll(colMail) + 1
                    addressCounters(colMail) = addressCounters(colMail) + 1
                End If
                '############################ END OF THE EMAIL SECTION ##########################################
                'Check for Facebook address
                If InStr(1, UCase(nodeOneLink.href), "FACEBOOK") Then
                    'Write Facebook address to both tables
                    Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colFacebook).Value = nodeOneLink.href
                    Sheets(tableAllAddresses).Cells(currentRowsTableAll(colFacebook), colFacebook).Value = nodeOneLink.href
                    'Check if it is a new line in the sheet with all addresses
                    If currentRowsTableAll(colFacebook) >= currentRowsTableAll(colUrl) Then
                        'Write URL in the new line of the sheet with all addresses
                        Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
                        'Increment url counter
                        currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
                    End If
                    'Increment Facebook counters
                    currentRowsTableAll(colFacebook) = currentRowsTableAll(colFacebook) + 1
                    addressCounters(colFacebook) = addressCounters(colFacebook) + 1
                End If
            Next nodeOneLink

            'Check address counters
            For checkCounters = colMail To colFacebook
                'Set comment if more than 1 link was found
                If addressCounters(checkCounters) > 1 Then
                    Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, checkCounters).AddComment Text:=CStr(addressCounters(checkCounters))
                    Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, checkCounters).Comment.Shape.TextFrame.AutoSize = True
                End If
            Next checkCounters
        Else
            'Page not loaded; write message to URL table
            Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colError).Value = "Error with URL or timeout"
        End If

        'Prepare for next page
        pageLoadSuccessful = False
        Erase addressCounters
        lastRowTableAll = Sheets(tableAllAddresses).Cells(Rows.Count, colUrl).End(xlUp).Row
        For checkCounters = colUrl To colFacebook
            currentRowsTableAll(checkCounters) = lastRowTableAll + 1 'First rows for next page content
        Next checkCounters
        currentRowTableUrls = currentRowTableUrls + 1
    Loop

    'Clean up
    Set http = Nothing
    Set htmlDoc = Nothing
    Set nodeAllLinks = Nothing
    Set nodeOneLink = Nothing
End Sub
This is the original post on S.O. with the new code that is out of my depth:
My Post on S.O
In the above link there is the new code with a RegExp in it; however, I cannot adapt that code to work with mine. If its RegExp function can be used and adapted to my code, that would be super.
I have also tried to use this RegExp within my code, but with NO luck. In my code above I ONLY NEED THE EMAIL part updated, NOT the whole code rewritten.
For Each link In links
    'Set doc = NewHTMLDocument(CStr(link))
    Set Html = NewHTMLDocument(CStr(link))
    With regxp
        '########## Phone numbers pattern ##########
        .Pattern = "(?:\+1)?(?:\+[0-9])?\(?([0-9]{4})\)?[-. ]?([0-9]{4})[-. ]?([0-9]{3}?)" '"(?:\+1)?(?:\+[0-9])?\(?([0-9]{3})\)?[-. ]?([0-9]{3})[-. ]?([0-9]{3}?)"
        .Global = False
        .IgnoreCase = True
        Set phone_list = .Execute(Html.body.innerHtml)
        '########## Email pattern ##########
        .Pattern = "([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)"
        .Global = False
        .IgnoreCase = True
        Set email_list = .Execute(Html.body.innerHtml)
Edit, 6 Jan 2021, 6:00 PM UK time:
If I change this
If InStr(1, nodeOneLink.href, "mailto:") Then
to this
If InStr(1, nodeOneLink.href, "@") Then
then I get slightly better results. Ideally I would like a RegExp; if anyone could help that would be super, as I am stuck on this and cannot go any further.
Edit, 6 Jan 2021, 8:00 PM UK time:
I was advised that this is the RegExp function that I need:
Private Function GetEmailAddressesFromHtml(ByVal htmlDocument As Object) As Collection
    ' Should return a collection of strings representing email addresses detected
    ' in the HTML document.
    Dim outputCollection As Collection
    Set outputCollection = New Collection

    Dim regEx As Object
    Set regEx = CreateObject("VBScript.RegExp")
    With regEx
        .Pattern = "[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
        .Global = True
        Dim emailMatches As Object
        Set emailMatches = .Execute(htmlDocument.body.innerHTML)
    End With

    Dim matchFound As Object
    For Each matchFound In emailMatches
        On Error Resume Next ' De-duplicate here.
        outputCollection.Add matchFound.Value, Key:=matchFound.Value
        On Error GoTo 0
    Next matchFound

    Set GetEmailAddressesFromHtml = outputCollection
End Function
And it needs to be called like this:
Dim colEmails As Collection
Set colEmails = GetEmailAddressesFromHtml(htmlDoc)
If colEmails.Count > 0 Then 'A VBA Collection has Count, not Length
    '*****ADD YOUR CODE HERE*****
End If
However I have been trying for months now and cannot work it out. If someone can advise, that would be super.
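For what it's worth, here is a minimal sketch of that wiring: call the function once per page, directly after htmlDoc is filled, and reuse the original row bookkeeping. Writing only the first detected address into the URL sheet (as the mailto: version effectively did) is an assumption:

'Sketch: replace the mailto: block with this, placed directly after
'htmlDoc.body.innerHtml = http.responseText
Dim colEmails As Collection
Set colEmails = GetEmailAddressesFromHtml(htmlDoc)
If colEmails.Count > 0 Then
    'Write the first detected mail address to both tables
    Sheets(tableUrlsOneAddressLeft).Cells(currentRowTableUrls, colMail).Value = colEmails(1)
    Sheets(tableAllAddresses).Cells(currentRowsTableAll(colMail), colMail).Value = colEmails(1)
    'Check if it is a new line in the sheet with all addresses
    If currentRowsTableAll(colMail) >= currentRowsTableAll(colUrl) Then
        Sheets(tableAllAddresses).Cells(currentRowsTableAll(colUrl), colUrl).Value = url
        currentRowsTableAll(colUrl) = currentRowsTableAll(colUrl) + 1
    End If
    'Increment mail counters
    currentRowsTableAll(colMail) = currentRowsTableAll(colMail) + 1
    addressCounters(colMail) = addressCounters(colMail) + colEmails.Count
End If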
Edit, 7 Jan 2021, 3:50 PM UK time:
I have simplified the code, so I now have the RegExp for emails and phones; however, I cannot work out how to add the code to extract the social media links.
New code:
Private Sub CommandButton1_Click()
    Dim wb As Workbook
    Dim wsSheet As Worksheet, links As Variant, link As Variant
    Dim rw As Long
    Dim regxp As New RegExp, post As Object, phone_list As Object, email_list As Object
    Dim Html As New HTMLDocument

    ''''SHEET1 as sheet with URL
    Set wb = ThisWorkbook
    Set wsSheet = wb.Sheets("Sheet1")
    rw = wsSheet.Cells(wsSheet.Rows.Count, "A").End(xlUp).Row
    links = wsSheet.Range("A2:A" & rw)

    For Each link In links
        'Set doc = NewHTMLDocument(CStr(link))
        Set Html = NewHTMLDocument(CStr(link))
        With regxp
            '########## Phone numbers pattern ##########
            .Pattern = "(?:\+1)?(?:\+[0-9])?\(?([0-9]{4})\)?[-. ]?([0-9]{4})[-. ]?([0-9]{3}?)" '"(?:\+1)?(?:\+[0-9])?\(?([0-9]{3})\)?[-. ]?([0-9]{3})[-. ]?([0-9]{3}?)"
            .Global = False
            .IgnoreCase = True
            Set phone_list = .Execute(Html.body.innerHtml)
            '########## Email pattern ##########
            .Pattern = "([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)"
            .Global = False
            .IgnoreCase = True
            Set email_list = .Execute(Html.body.innerHtml)

            '########## Phone list: add to sheet (column B) ##########
            On Error Resume Next
            If phone_list(0) Is Nothing Then
                Sheet1.Cells(Sheet1.Cells(Sheet1.Rows.Count, "B").End(xlUp).Row + 1, "B").Value = "-"
            Else
                Sheet1.Cells(Sheet1.Cells(Sheet1.Rows.Count, "B").End(xlUp).Row + 1, "B").Value = phone_list(0)
            End If
            '########## Email list: add to sheet (column C) ##########
            If email_list(0) Is Nothing Then
                Sheet1.Cells(Sheet1.Cells(Sheet1.Rows.Count, "C").End(xlUp).Row + 1, "C").Value = "-"
            Else
                Sheet1.Cells(Sheet1.Cells(Sheet1.Rows.Count, "C").End(xlUp).Row + 1, "C").Value = email_list(0)
            End If
            On Error GoTo 0
        End With
        ''''navigate links
    Next link
End Sub
Public Function NewHTMLDocument(strURL As String) As Object
    Dim objHTTP As Object, objHTML As Object, strTemp As String
    Set objHTTP = CreateObject("MSXML2.ServerXMLHTTP")
    objHTTP.Open "GET", strURL, False
    objHTTP.send
    If objHTTP.Status = 200 Then
        strTemp = objHTTP.responseText
        Set objHTML = CreateObject("htmlfile")
        objHTML.body.innerHtml = strTemp
        Set NewHTMLDocument = objHTML
    Else
        'There has been an error
    End If
End Function
I need something like this added to the above code (a sketch of a possible integration follows after the full code below):
Html.body.innerHTML = http.responseText
Set links = Html.getElementsByTagName("a")
'''COLUMN D = FACEBOOK
For Each link In links
    If InStr(UCase(link.outerHTML), "FACEBOOK") Then
        website.Offset(0, 3).Value = link.href
    End If
    '''COLUMN E = TWITTER
    If InStr(UCase(link.outerHTML), "TWITTER") Then
        website.Offset(0, 4).Value = link.href
    End If
Next
This is the full code from which the above snippet comes. I do not need all of what is below, only the part that extracts the social media links. I have also posted here: My Post On Mr Excel
Dim counter As Long
Dim website As Range
Dim row As Long
Dim continue As Boolean
Dim respHead As String

''''The row where website addresses start
row = 2
continue = True
Set http = CreateObject("MSXML2.ServerXMLHTTP.6.0")

Do While continue
    '''Could set this to first cell with URL then OFFSET columns to get next web site
    Set website = ThisWorkbook.Worksheets("Sheet1").Range("A" & row)
    If Len(website.Value) < 1 Then
        continue = False
        Exit Sub 'Note: this is why the Complete form below never shows; Exit Do would reach it
    End If
    If website Is Nothing Then
        continue = False
    End If
    '''Debug.Print website
    With http
        On Error Resume Next
        .Open "GET", website.Value, False
        .send
        '''If the website sent a valid response to our request, URLS ARE IN COLUMN A
        If Err.Number = 0 Then
            If .Status = 200 Then
                Html.body.innerHTML = http.responseText
                Set links = Html.getElementsByTagName("a")
                '''COLUMN D = FACEBOOK
                For Each link In links
                    If InStr(UCase(link.outerHTML), "FACEBOOK") Then
                        website.Offset(0, 3).Value = link.href
                    End If
                    '''COLUMN E = TWITTER
                    If InStr(UCase(link.outerHTML), "TWITTER") Then
                        website.Offset(0, 4).Value = link.href
                    End If
                Next
            End If
            Set website = Nothing
        Else
            '''Debug.Print "Error loading page IN COLUMN H"
            website.Offset(0, 8).Value = "Error with website address"
        End If
        On Error GoTo 0
    End With
    row = row + 1
Loop

Complete.Show '#### THIS FORM DOES NOT SHOW AT THE END ####

''' CLOSE BROWSER
IE.Quit
Set IE = Nothing
Set ElementCol = Nothing
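For what it's worth, a minimal sketch of how that social media part could be bolted onto the simplified CommandButton1_Click code above, reusing the Html document that NewHTMLDocument already returned. The column letters D and E come from the snippet; writing to the same row as the email just written is an assumption:

'Sketch: place this inside For Each link In links, after the email write
Dim anchors As Object, anchor As Object, socialRow As Long
Set anchors = Html.getElementsByTagName("a")
socialRow = Sheet1.Cells(Sheet1.Rows.Count, "C").End(xlUp).Row 'row of the email just written
For Each anchor In anchors
    If InStr(UCase(anchor.outerHTML), "FACEBOOK") > 0 Then
        Sheet1.Cells(socialRow, "D").Value = anchor.href 'COLUMN D = FACEBOOK
    End If
    If InStr(UCase(anchor.outerHTML), "TWITTER") > 0 Then
        Sheet1.Cells(socialRow, "E").Value = anchor.href 'COLUMN E = TWITTER
    End If
Next anchor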

Excel VBA data scrape - not all data being pulled

I posted a similar question recently, and I've been able to successfully scrape some data from webpages since then. However, I've run into a problem when trying to scrape this page: https://www.scpcn.ca/clinics
I'm trying to get addresses and clinic names, but with this code I only get the first page. It also outputs each clinic on the first page twice, and no addresses.
Even weirder is that it worked once, pulling all the clinics and about half the addresses; when I deleted that and tried to run it again, I only got the first 10 clinics, and that's where it's at now.
I thought the issue was that the page wasn't given long enough to load, so I added a wait timer, but that didn't seem to do anything.
Option Explicit
Sub GetSouthClinicData()
    Dim objIE As InternetExplorer
    Dim clinicEle As Object
    Dim clinicAdd As Object
    Dim clinicName As String
    Dim clinicAddress As String
    Dim y As Integer
    Dim x As Integer

    Set objIE = New InternetExplorer
    objIE.Visible = True
    objIE.Navigate "https://www.scpcn.ca/clinics"
    While objIE.Busy = True Or objIE.readyState <> 4: DoEvents: Wend
    Application.Wait (Now + TimeValue("0:00:5"))

    y = 2
    For Each clinicEle In objIE.document.getElementsByClassName("clinic-title")
        clinicName = clinicEle.getElementsByTagName("a")(0).innerText
        Sheets("Sheet3").Range("A" & y).Value = clinicName
        y = y + 1
    Next

    x = 2
    For Each clinicEle In objIE.document.getElementsByClassName("toggle-address clinic-address")
        clinicAddress = clinicEle.getElementsByTagName("br")(0).innerText
        Sheets("Sheet3").Range("B" & x).Value = clinicAddress
        x = x + 1
    Next

    objIE.Quit
End Sub
Try the following, which returns nodeLists based on class name and then indexes into them.
Option Explicit
Public Sub GetInfo()
    Dim html As HTMLDocument
    Set html = New HTMLDocument
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "https://www.scpcn.ca/clinics", False
        .send
        html.body.innerHTML = .responseText
        Dim names As Object, addresses As Object, i As Long
        Set names = html.querySelectorAll(".clinic-title")
        Set addresses = html.querySelectorAll(".clinic-address")
        For i = 0 To names.Length - 1
            With ActiveSheet
                .Cells(i + 1, 1) = names.item(i).innerText
                .Cells(i + 1, 2) = addresses.item(i).innerText
            End With
        Next
    End With
End Sub

Web scraping with getElementsByTagName()

I want to import restaurant data like restaurant name, phone number, website, and address to Excel, but unfortunately I am getting ads and garbage data. I created my code using the http://automatetheweb.net/vba-getelementsbytagname-method/ website, but it is not helping. Please rectify the issue in my code.
Website: https://www.yellowpages.com/atlanta-ga/attorneys
Please do not suggest JSON, as it does not work on other websites.
Sub Yellowcom()
    'Dim ieObj As InternetExplorer
    Dim htmlELe As IHTMLElement
    Dim HTML As HTMLDocument
    Dim i As Integer
    Dim URL As String
    Dim URLParameter As String
    Dim page As Long
    Dim links As Object
    Dim IE As Object

    i = 1
    Set IE = CreateObject("InternetExplorer.Application")
    'Set ieObj = New InternetExplorer
    IE.Visible = True
    URL = "https://www.yellowpages.com/atlanta-ga/attorneys"
    'Application.Wait Now + TimeValue("00:00:05")

    For page = 2 To 4
        If page > 1 Then URLParameter = "?page=" & page
        IE.navigate URL & URLParameter
        ' Wait for the browser to load the page
        Do Until IE.readyState = 4
            DoEvents
        Loop

        Set HTML = IE.document
        Set links = HTML.getElementsByClassName("info")
        For Each htmlELe In links
            With ActiveSheet
                .Range("A" & i).Value = htmlELe.Children(0).textContent
                .Range("B" & i).Value = htmlELe.getElementsByTagName("a")(0).href
                .Range("C" & i).Value = htmlELe.Children(2).textContent
                .Range("D" & i).Value = htmlELe.Children(2).querySelector("a[href]")
                'links2 = htmlELe.getElementsByClassName("links")(1)
                '.Range("D" & i).Value = links2.href
            End With
            i = i + 1
        Next htmlELe
    Next page

    IE.Quit
    Set IE = Nothing
End Sub
The required output should have the name, phone, website, and address in separate columns.
I would use XHR rather than a browser, store the data in an array for each page, and write that out to the sheet. You could dimension one array to hold all results in advance, based on results per page and the number of pages, but the below is still efficient.
Option Explicit
Public Sub GetListings()
    Dim html As HTMLDocument, page As Long, html2 As HTMLDocument
    Dim results As Object, headers(), ws As Worksheet, i As Long
    Const START_PAGE As Long = 1
    Const END_PAGE As Long = 2

    Set ws = ThisWorkbook.Worksheets("Sheet1")
    headers = Array("Name", "Phone", "Website", "Address")
    Application.ScreenUpdating = False
    Set html = New HTMLDocument
    Set html2 = New HTMLDocument
    ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers

    With CreateObject("MSXML2.XMLHTTP")
        For page = START_PAGE To END_PAGE
            .Open "GET", "https://www.yellowpages.com/atlanta-ga/attorneys?page=" & page, False
            .send
            html.body.innerHTML = .responseText
            Set results = html.querySelectorAll(".organic .result")
            Dim output(), r As Long
            ReDim output(1 To results.Length, 1 To 4)
            r = 1
            For i = 0 To results.Length - 1
                On Error Resume Next
                html2.body.innerHTML = results.item(i).outerHTML
                output(r, 1) = html2.querySelector(".business-name").innerText
                output(r, 2) = html2.querySelector(".phone").innerText
                output(r, 3) = html2.querySelector(".track-visit-website").href
                output(r, 4) = html2.querySelector(".street-address").innerText & " " & html2.querySelector(".locality").innerText
                On Error GoTo 0
                r = r + 1
            Next
            ws.Cells(GetLastRow(ws, 1) + 1, 1).Resize(UBound(output, 1), UBound(output, 2)) = output
        Next
    End With
    Application.ScreenUpdating = True
End Sub

Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
    With ws
        GetLastRow = .Cells(.rows.Count, columnNumber).End(xlUp).Row
    End With
End Function
The info class is also used for the advertisements. You first need to get the collection whose class name is "search-results organic" and, within it, find all the "info" classes.
This means that you need an extra collection variable:
Set HTML = IE.document
Set OrganicLinks = HTML.getElementsByClassName("search-results organic")
Set links = OrganicLinks.item(0).getElementsByClassName("info")
For getting the right website, you need to use another reference. It's better to get it by class name, since that one is more specific:
On Error Resume Next
.Range("B" & i).Value = htmlELe.getElementsByClassName("track-visit-website")(0).href
On Error GoTo 0
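Putting both corrections together, the relevant part of the original loop might look like the sketch below (OrganicLinks is the extra collection variable mentioned above; the rest mirrors the question's code):

Set HTML = IE.document
'Restrict the search to the organic results so advertisements are skipped
Set OrganicLinks = HTML.getElementsByClassName("search-results organic")
Set links = OrganicLinks.item(0).getElementsByClassName("info")
For Each htmlELe In links
    With ActiveSheet
        .Range("A" & i).Value = htmlELe.Children(0).textContent
        On Error Resume Next 'not every listing has a website link
        .Range("B" & i).Value = htmlELe.getElementsByClassName("track-visit-website")(0).href
        On Error GoTo 0
        .Range("C" & i).Value = htmlELe.Children(2).textContent
    End With
    i = i + 1
Next htmlELe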

VBA HTML Listing Info Pull

I am looking to follow a series of URLs found in column A (example: https://www.ebay.com/itm/Apple-iPhone-7-GSM-Unlocked-Verizon-AT-T-TMobile-Sprint-32GB-128GB-256GB/352381131997?epid=225303158&hash=item520b8d5cdd:m:mWgYDe4a79NeLuAlV-RmAQA:rk:7:pf:0) and pull the following information from them:
- Title
- Price
- Description
I think there are multiple issues with my code... For one, I can't get the program to follow the specific URLs listed in the Excel sheet (only if I specify one within the code). Also, pulling multiple fields has given me issues.
Option Explicit
Public Sub ListingInfo()
    Dim ie As New InternetExplorer, ws As Worksheet, t As Date
    Dim i As Integer
    i = 0
    Do While Worksheets("Sheet1").Cells(i, 1).Value <> ""
        Const MAX_WAIT_SEC As Long = 10
        Set ws = ThisWorkbook.Worksheets("Sheet1")
        With ie
            .Visible = True
            .Navigate2 Worksheets("Sheet1").Cells(i, 1).Value
            While .Busy Or .readyState < 4: DoEvents: Wend
            Dim Links As Object, i As Long, count As Long
            t = Timer
            Do
                On Error Resume Next
                Set Title = .document.querySelectorAll("it-ttl")
                Set price = .document.querySelectorAll("notranslate")
                Set Description = .document.querySelectorAll("ds_div")
                count = Links.Length
                On Error GoTo 0
                If Timer - t > MAX_WAIT_SEC Then Exit Do
            Loop While count = 0
            For i = 0 To Title.Length - 1
                ws.Cells(i + 1, 1) = Title.item(i)
                ws.Cells(i + 1, 2) = price.item(i)
                ws.Cells(i + 1, 3) = Description.item(i)
            Next
            .Quit
            i = i + 1
    Loop
    End With
End Sub
I would use late binding for MSXML2.XMLHTTP and set a reference to the Microsoft HTML Object Library for the HTMLDocument.
Note: querySelector() returns the first item that matches its search string.
Here is the short version:
Public Sub ListingInfo()
    Dim cell As Range
    With ThisWorkbook.Worksheets("Sheet1")
        For Each cell In .Range("A1", .Cells(.Rows.Count, 1).End(xlUp))
            Dim Document As MSHTML.HTMLDocument
            With CreateObject("MSXML2.XMLHTTP")
                .Open "GET", cell.Value, False
                .send
                Set Document = New MSHTML.HTMLDocument
                Document.body.innerHTML = .responseText
            End With
            cell.Offset(0, 1).Value = Document.getElementByID("itemTitle").innerText
            cell.Offset(0, 2).Value = Document.getElementByID("prcIsum").innerText
            If Not Document.querySelector(".viSNotesCnt") Is Nothing Then
                cell.Offset(0, 3).Value = Document.querySelector(".viSNotesCnt").innerText
            Else
                'Try Something Else
            End If
        Next
    End With
End Sub
A more elaborate solution would be to break the code up into smaller routines and load the data into an Array. The main advantage of this is that you can test each subroutine separately.
Option Explicit

Public Type tListingInfo
    Description As String
    Price As Currency
    Title As String
End Type

Public Sub ListingInfo()
    Dim source As Range
    Dim data As Variant
    With ThisWorkbook.Worksheets("Sheet1")
        Set source = .Range("A1:D1", .Cells(.Rows.count, 1).End(xlUp))
        data = source.Value
    End With

    Dim r As Long
    Dim record As tListingInfo
    Dim url As String
    For r = 1 To UBound(data)
        url = data(r, 1)
        record = getListingInfo(url)
        With record
            data(r, 2) = .Description
            data(r, 3) = .Price
            data(r, 4) = .Title
        End With
    Next
    source.Value = data
End Sub

Public Function getListingInfo(url As String) As tListingInfo
    Dim ListingInfo As tListingInfo
    Dim Document As MSHTML.HTMLDocument
    Set Document = getHTMLDocument(url)
    With ListingInfo
        .Description = Document.getElementByID("itemTitle").innerText
        .Price = Split(Document.getElementByID("prcIsum").innerText)(1)
        .Title = Document.querySelectorAll(".viSNotesCnt")(0).innerText
        Debug.Print .Description, .Price, .Title
    End With
    getListingInfo = ListingInfo
End Function

Public Function getHTMLDocument(url As String) As MSHTML.HTMLDocument
    Const READYSTATE_COMPLETE As Long = 4
    Dim Document As MSHTML.HTMLDocument
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", url, False
        .send
        If .readyState = READYSTATE_COMPLETE And .Status = 200 Then
            Set Document = New MSHTML.HTMLDocument
            Document.body.innerHTML = .responseText
            Set getHTMLDocument = Document
        Else
            MsgBox "URL: " & vbCrLf & "Ready state: " & .readyState & vbCrLf & "HTTP request status: " & .Status, vbInformation, "URL Not Responding"
        End If
    End With
End Function
There are a lot of things to fix in your code. It is late here, so I will just give pointers for now (and update fully later), with working code below:
Declare all variables and use the appropriate types.
Review For loops and how Transpose can be used to create a 1D array of URLs pulled from the sheet to loop over.
Review the difference between the querySelector and querySelectorAll methods.
Review CSS selectors (you are specifying everything as a type selector, when in fact you are not selecting by tag for the elements of interest, nor by your stated text).
Think about the placement of your IE object creation and of your .Navigate2, to make use of the existing object.
Make sure to use distinct loop counters.
Be sure not to overwrite values in the sheet.
Code:
Option Explicit
Public Sub ListingInfo()
    Dim ie As New InternetExplorer, ws As Worksheet
    Dim i As Long, urls(), rowCounter As Long
    Dim title As Object, price As Object, description As Object

    Set ws = ThisWorkbook.Worksheets("Sheet1")
    urls = Application.Transpose(ws.Range("A1:A2").Value) '<= Adjust

    With ie
        .Visible = True
        For i = LBound(urls) To UBound(urls)
            If InStr(urls(i), "http") > 0 Then
                rowCounter = rowCounter + 1
                .Navigate2 urls(i)
                While .Busy Or .readyState < 4: DoEvents: Wend
                Set title = .document.querySelector(".it-ttl")
                Set price = .document.querySelector("#prcIsum")
                Set description = .document.querySelector("#viTabs_0_is")
                ws.Cells(rowCounter, 3) = title.innerText
                ws.Cells(rowCounter, 4) = price.innerText
                ws.Cells(rowCounter, 5) = description.innerText
                Set title = Nothing: Set price = Nothing: Set description = Nothing
            End If
        Next
        .Quit
    End With
End Sub
Here's an approach using web requests with MSXML. It should be significantly faster than using IE, and I'd encourage you to strongly consider this approach wherever possible.
You'll need references to Microsoft HTML Object Library and Microsoft XML v6.0 to get this working.
Option Explicit
Public Sub SubmitRequest()
    Dim URLs As Excel.Range
    Dim URL As Excel.Range
    Dim LastRow As Long
    Dim wb As Excel.Workbook: Set wb = ThisWorkbook
    Dim ws As Excel.Worksheet: Set ws = wb.Worksheets(1)
    Dim ListingDetail As Variant
    Dim i As Long
    Dim j As Long
    Dim html As HTMLDocument
    ReDim ListingDetail(0 To 2, 0 To 10000)

    'Get URLs
    With ws
        LastRow = .Cells(.Rows.Count, 1).End(xlUp).Row
        Set URLs = .Range(.Cells(1, 1), .Cells(LastRow, 1))
    End With

    'Update the ListingDetail
    For Each URL In URLs
        Set html = getHTML(URL.Value2)
        ListingDetail(0, i) = html.getElementByID("itemTitle").innertext 'Title
        ListingDetail(1, i) = html.getElementByID("prcIsum").innertext 'Price
        ListingDetail(2, i) = html.getElementsByClassName("viSNotesCnt")(0).innertext 'Seller Notes
        i = i + 1
    Next

    'Resize array
    ReDim Preserve ListingDetail(0 To 2, 0 To i - 1)

    'Dump in columns T, U, V of existing sheet
    ws.Range("T1:V" & i).Value = WorksheetFunction.Transpose(ListingDetail)
End Sub

Private Function getHTML(ByVal URL As String) As HTMLDocument
    'Add a reference to Microsoft HTML Object Library
    Set getHTML = New HTMLDocument
    With New MSXML2.XMLHTTP60
        .Open "GET", URL, False 'Synchronous request
        .send
        getHTML.body.innerHTML = .responseText
    End With
End Function

Pulling Data from an embedded web page with VBA Excel

I am trying to use VBA in Excel to access data in a web page that is embedded in another web page. I know how to do this if the table is on a non-embedded page. I also know how to navigate to this product's page using VBA. I cannot just navigate to the embedded page, because there is a product id lookup that converts a part number to an id, and I don't have access to that database.
Here is the link to the page: http://support.automation.siemens.com/WW/view/en/7224052
I would have posted a picture of the element for clarity, but I don't have 10 rep points...
The table I need to get information from is the "Product Life Cycle" table.
I can see the correct url in a property called src under the corresponding item if I save the page as an HTMLDocument in VBA using the following code:
For Each cell In Selection
    link = "http://support.automation.siemens.com/US/llisapi.dll?func=cslib.csinfo&lang=en&objid=" & cell & "&caller=view"
    ie.navigate link
    Do
        DoEvents
    Loop Until ie.readyState = READYSTATE_COMPLETE
    Dim doc As HTMLDocument
Is there a way to index this table with VBA or will I have to contact the company and attempt to get access to the product ID so I can navigate to the page directly?
Regarding my comment below, here is the code that a recorded macro yields:
ActiveCell.FormulaR1C1 = _
    "http://support.automation.siemens.com/WW/llisapi.dll?func=cslib.csinfo&lang=en&objid=6ES7194-1AA01-0XA0&caller=view"
Range("F9").Select
With ActiveSheet.QueryTables.Add(Connection:= _
    "URL;http://support.automation.siemens.com/WW/llisapi.dll?func=ll&objid=7224052&nodeid0=10997566&caller=view&lang=en&siteid=cseus&aktprim=0&objaction=csopen&extranet=standard&viewreg=WW" _
    , Destination:=Range("$F$9"))
    .FieldNames = True
    .RowNumbers = False
I know where to find the string URL;http://support.automation.siemens.com/WW/llisapi.dll?func=ll&objid=7224052&nodeid0=10997566&caller=view&lang=en&siteid=cseus&aktprim=0&objaction=csopen&extranet=standard&viewreg=WW, but I don't know how to save it to a variable.
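For what it's worth, if the goal is just to capture that connection string in a variable, one option is to read it back from the QueryTable the recorded macro created (a sketch; it assumes the QueryTable is the only one on the active sheet):

Dim connString As String, tableUrl As String
' Read the connection string back from the recorded QueryTable
connString = ActiveSheet.QueryTables(1).Connection ' e.g. "URL;http://support.automation.siemens.com/..."
' Drop the leading "URL;" prefix to keep just the address
tableUrl = Mid(connString, InStr(connString, ";") + 1)
Debug.Print tableUrl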
Not sure I exactly understand your question, but here is some code that will get the source code behind the table of interest. You can extract the data of interest using functions like InStr and Mid.
' Open IE, navigate to the website of interest and loop until fully loaded
Set ie = CreateObject("InternetExplorer.Application")
With ie
    .Visible = True
    .navigate "http://support.automation.siemens.com/WW/view/en/7224052"
    .Top = 50
    .Left = 530
    .Height = 400
    .Width = 400
    Do Until Not ie.Busy And ie.ReadyState = 4
        DoEvents
    Loop
End With

' Assign the source code behind the page to a variable
my_var = ie.document.frames(3).document.DocumentElement.innerhtml

' Extract the url for the "Product life cycle" table
pos_1 = InStr(1, my_var, "product life cycle", vbTextCompare)
pos_2 = InStr(pos_1, my_var, "/WW/llisapi", vbTextCompare)
pos_3 = InStr(pos_2, my_var, """><", vbTextCompare)
pos_4 = InStr(pos_3, my_var, """/>", vbTextCompare)
table_url = Mid(my_var, pos_2, pos_3 - pos_2)
table_url = Replace(table_url, "amp;", "", 1, -1, vbTextCompare)
table_url = "http://support.automation.siemens.com" & table_url

' Navigate to the table url
ie.navigate table_url
Do Until Not ie.Busy And ie.ReadyState = 4
    DoEvents
Loop

' Assign the source code for this page to a variable and extract the desired information
my_var2 = ie.document.body.innerhtml
pos_1 = InStr(1, my_var2, "ET 200X, basic modules,", vbTextCompare)

' Close IE
ie.Quit
I have had problems getting ron's code to work, I think because IE doesn't work easily with frames. Below is some code that will extract some of the data from the table you mentioned; it doesn't yet handle the diagrams.
Sub FrameStrip()
    Dim oFrames As Object
    Dim tdelements As Object
    Dim tdElement As Object
    Dim oFrame As MSHTML.HTMLFrameElement
    Dim oElement As Object
    Dim sString As String
    Dim myVar As Variant
    Dim sLinks() As String
    Dim i As Integer
    Dim bfound As Boolean
    Dim url As String
    Dim oIE As InternetExplorer

    Set oIE = New InternetExplorer
    url = "http://support.automation.siemens.com/WW/view/en/7224052"

    'Set address for use with relative source names
    myVar = Split(url, "/")
    sString = myVar(0) & "//" & myVar(2)

    oIE.navigate url
    oIE.Visible = True
    Do Until (oIE.readyState = 4 And Not oIE.Busy)
        DoEvents
    Loop

    Set oFrames = oIE.document.getElementsByTagName("frame")
    ReDim sLinks(oFrames.Length)

    'Get the source locations for each frame
    i = 0
    For Each oFrame In oFrames
        sLinks(i) = sString & (oFrame.getAttribute("src"))
        i = i + 1
    Next oFrame

    'Go through each frame to find the table
    i = 0
    bfound = False
    Do While i < UBound(sLinks) And bfound = False
        oIE.navigate sLinks(i)
        Do Until (oIE.readyState = 4 And Not oIE.Busy)
            DoEvents
        Loop
        Set oElement = oIE.document.getElementById("produktangaben")
        bfound = IsSet(oElement)
        i = i + 1
    Loop

    Set tdelements = oElement.getElementsByTagName("td")

    'Display information about the table
    sString = ""
    For Each tdElement In tdelements
        Debug.Print tdElement.innerText
        sString = sString & tdElement.innerText
    Next tdElement
End Sub

Function IsSet(ByRef oElement As Object) As Boolean
    Dim tdelements As Object
    Dim bSet As Boolean
    bSet = True
    On Error GoTo ErrorSet
    Set tdelements = oElement.getElementsByTagName("td")
    On Error GoTo 0
Cleanup:
    On Error Resume Next
    Set tdelements = Nothing
    On Error GoTo 0
    IsSet = bSet
    Exit Function
ErrorSet:
    bSet = False
    GoTo Cleanup
End Function
