I'm trying to access webpage elements that I may need to scrape.
I have a handle on how to access headers by tag name but I am having trouble with the div.address tag in the image.
The Inspect Element view
My code. Problem line is marked with +++++++:
Sub gettitleheader()
Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = True
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
On Error GoTo err_clear
Cells(i, 2) = doc.getelementsbytagname("h1")(0).innerText
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
On Error GoTo err2_clear
Cells(i, 3) = doc.getelementsbytagname("address")(0).innerHTML '+++++++
err2_clear:
If Err2 <> 0 Then
Err2.Clear
Resume Next
End If
wb.Quit
Range(Cells(i, 1), Cells(i, 4)).Columns.AutoFit
Next i
End Sub
In the image below, the h1 tag is scraped and populated but not the address text.
Final output image, (I'm trying to fill that empty box)
Answered my own question! so proud!
Sub gettitleheader()
Dim wb As Object
Dim doc As Object
Dim y As String
Dim z As String
Dim sURL As String
Dim lastrow As Long
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = True
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
On Error GoTo err_clear
Cells(i, 2) = doc.getElementsByTagName("h1")(0).innerText
y = doc.getElementsByClassName("address")(0).innerText
Do While (InStr(y, Chr(10)))
y = Right(y, Len(y) - 1)
Loop
z = y
Cells(i, 3) = z
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Related
I wrote excel macro to fetch data from multiple pages ( here around 25-40 pages ) . I have managed to change pages and scrape all pages from every page .
Sub Fetch_Data()
Dim IE As Object
Dim httpReq As Object
Dim HTMLdoc As Object
Dim resultsTable As Object
Dim tRow As Object, tCell As Object
Dim destCell As Range
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = False
'Application.ScreenUpdating = False
Application.StatusBar = "Data Fetching in progress, please wait..."
IE.Navigate "https://www.bseindia.com/markets/debt/TradenSettlement.aspx" 'load the Backshop Loan Locator page
Do
DoEvents
Loop Until IE.ReadyState = 4
Set HTMLdoc = IE.Document
'LR = Cells(Rows.Count, 1).End(xlUp).Row
With ActiveSheet
'.Cells.ClearContents
Set destCell = .Range("A1")
End With
Set resultsTable = HTMLdoc.getElementById("ContentPlaceHolder1_GridViewrcdsFC")
For Each tRow In resultsTable.Rows
For Each tCell In tRow.Cells
destCell.Offset(tRow.RowIndex, tCell.cellIndex).Value = tCell.innerText
Next
Next
'________________________________________________________________________________________________________________________
'Go to Next page
'IE.Navigate "javascript:__doPostBack('ctl00$ContentPlaceHolder1$GridViewrcdsFC','Page$2')"
i = 2
For i = 2 To 50
If i = 2 Then
Url = "javascript:__doPostBack('ctl00$ContentPlaceHolder1$GridViewrcdsFC','Page$2')"
On Error GoTo ErrorHandler
ElseIf i = 3 Then
Url = "javascript:__doPostBack('ctl00$ContentPlaceHolder1$GridViewrcdsFC','Page$3')"
On Error GoTo ErrorHandler
ElseIf i = 4 Then
Url = "javascript:__doPostBack('ctl00$ContentPlaceHolder1$GridViewrcdsFC','Page$4')"
On Error GoTo ErrorHandler
ElseIf i = 5 Then
Url = "javascript:__doPostBack('ctl00$ContentPlaceHolder1$GridViewrcdsFC','Page$5')"
On Error GoTo ErrorHandler
ErrorHandler:
GoTo XYZ
End If
IE.Navigate Url
Do
DoEvents
Loop Until IE.ReadyState = 4
Url = ""
LR = Cells(Rows.Count, 1).End(xlUp).Row - 1
With ActiveSheet
'.Cells.ClearContents
Set destCell = .Range("A" & LR)
End With
Set resultsTable = HTMLdoc.getElementById("ContentPlaceHolder1_GridViewrcdsFC")
For Each tRow In resultsTable.Rows
For Each tCell In tRow.Cells
destCell.Offset(tRow.RowIndex, tCell.cellIndex).Value = tCell.innerText
Next
Next
Next i
'________________________________________________________________________________________________________________________
XYZ: IE.Quit
Application.StatusBar = "Data Fetching Completed"
MsgBox ("Data Successfully Fetched")
Application.StatusBar = ""
Dim lrow As Long
Dim index As Long
Dim header As String
header = Range("A1").Value
lrow = Range("A" & Rows.Count).End(xlUp).Row
For index = 2 To lrow
If Range("A" & index).Value = header Then Rows(index).Delete
Next
End Sub
I want to change pages automatically without writing every page , I tried something like below , but pages are not getting changed , how to loop through pages :
For i = 2 To 4
x = "Page$" + CStr(i)
Url = "javascript:__doPostBack('ctl00$ContentPlaceHolder1$GridViewrcdsFC'," & x & ")"
On Error GoTo ErrorHandler
ErrorHandler:
GoTo XYZ
You have to look if there are url links to the other pages on the current page, find a tag and loop all the webpages. You can also look voor the url of each page and hardcode it.
Example with urls beneath tag "a":
Set AElements = HTMLDoc.getElementsByTagName("a")
For Each AElement In AElements
If AElement.id = "xxxxxxxxx" Then
Cells(Cell, 27) = AElement.src 'I write URL in the 27th column
'AElement.href
End If
Next AElement
I have code that loops through all Outlook emails under a subfolder and extracts the body of the email based on the subject. Code takes a lot of time to loop through all emails as there are thousands of them.
How do I modify the code to append data, extracted from the latest emails, to the existing file instead of looping through all the emails and overwriting again & again?
Let's say I want to run the code every day to get the prior day's email data.
Option Explicit
Sub FinalMacro()
Application.DisplayAlerts = False
Dim iCounter As Integer
'iCounter = 1
Dim wkb As Workbook
Set wkb = ThisWorkbook
Sheets("Sheet1").Cells.Clear
' point to the desired email
Const strMail As String = "emailaddress#outlook.com"
Dim oApp As Outlook.Application
Dim oMapi As Outlook.MAPIFolder
'Dim oMail As Outlook.MailItem
Dim x As Long, y As Long
Dim destCell As Range
Dim i As Long
Dim oItem As Object
With ActiveSheet
Set destCell = .Cells(Rows.Count, "A").End(xlUp)
End With
On Error Resume Next
Set oApp = GetObject(, "OUTLOOK.APPLICATION")
If (oApp Is Nothing) Then Set oApp = CreateObject("OUTLOOK.APPLICATION")
On Error GoTo 0
Set oMapi = oApp.GetNamespace("MAPI").Folders(strMail).Folders("inbox").Folders("Other mails")
For Each oItem In oMapi.Items
If oItem.Subject = "Volume data" Then
' get html table from email object
Dim HTMLdoc As MSHTML.HTMLDocument
Dim tables As MSHTML.IHTMLElementCollection
Dim table As MSHTML.HTMLTable
Set HTMLdoc = New MSHTML.HTMLDocument
With HTMLdoc
.Body.innerHTML = oItem.HTMLBody
Set tables = .getElementsByTagName("table")
End With
Dim t As Long, r As Long, c As Long
Dim eRow As Long
For t = 0 To tables.Length - 1
eRow = Sheets(1).Cells(Rows.Count, 1).End(xlUp).Offset(1, 0).Row
For r = 0 To (tables(t).Rows.Length - 1)
For c = 0 To (tables(t).Rows(r).Cells.Length - 1)
Range("A" & eRow).Offset(r, c).Value = tables(t).Rows(r).Cells(c).innerText
Next c
Next r
eRow = Sheets(1).Cells(Rows.Count, 1).End(xlUp).Offset(1, 0).Row
Next t
Cells(eRow, 1) = "Date & Time of Receipt:" & " " & oItem.ReceivedTime
Cells(eRow, 1).Interior.Color = vbRed
Cells(eRow, 1).Font.Color = vbWhite
Cells(eRow, 1).Columns.AutoFit
Set oApp = Nothing
Set oMapi = Nothing
Set HTMLdoc = Nothing
Set tables = Nothing
wkb.Save '"C:\Users\Desktop\Trial_1.xlsm"
End If
Next oItem
Application.DisplayAlerts = True
End Sub
To quickly select (filter) latest emails, you can use Items.Restrict.
To use your workbook for the accumulative storage of information, you just need not to erase the sheet, but to find the last filled line and add the content from the letters after it.
Smth like (not tested):
Set oMapi = oApp.GetNamespace("MAPI").Folders(strMail).Folders("inbox").Folders("Other mails")
Set wkb = ThisWorkbook
Set append_ws = wkb.Sheets("Sheet1") ' this worksheet is for appending
'Sheets("Sheet1").Cells.Clear ' - remove this statement
' set filter to: non-flagged mailitems received < 1 day ago
flt = "[FlagStatus] <> 1 And [MessageClass]='IPM.Note' And [ReceivedTime]>='" & _
Format(Now - 1, "ddddd 0:00") & "'"
Set Restricted = oMapi.Items.Restrict(flt)
For I = Restricted.Count To 1 Step -1
Set oItem = Restricted(I)
If oItem.Subject = "Volume data" Then
content_from_email = "smth from letter" ' get the content from the letter
lastrow = append_ws.Cells(append_ws.Rows.Count, 1).End(xlUp).row + 1
append_ws.Cells(lastrow, 1).Value = content_from_email
oItem.MarkAsTask olMarkComplete ' set flag to the processed items
oItem.Save
End If
Next I
I have made the macros script which retrieves the data from the URL. What I need is that, I need to increase the date one by one and get the data for each. the URL is like this :
https://www.ukdogracing.net/racecards/01-05-2017/monmore
Ia m able to get the data with this script :
Sub GetData()
Dim IE As Object
Dim doc As Object
Dim strURL As String
Dim I As Integer
For I = 1 To 5
strURL = "https://www.ukdogracing.net/racecards/01-05-2017/monmore" + Trim(Str(I))
Set IE = CreateObject("InternetExplorer.Application")
With IE
.navigate strURL
Do Until .ReadyState = 4: DoEvents: Loop
Do While .Busy: DoEvents: Loop
Set doc = IE.Document
GetAllTables doc
.Quit
End With
Next I
End Sub
Sub GetAllTables(doc As Object)
Dim ws As Worksheet
Dim rng As Range
Dim tbl As Object
Dim rw As Object
Dim cl As Object
Dim tabno As Long
Dim nextrow As Long
Dim I As Long
Dim ThisLink As Object 'variable for <a> tags
Set ws = Worksheets.Add
For Each tbl In doc.getElementsByTagName("TABLE")
tabno = tabno + 1
nextrow = nextrow + 1
Set rng = ws.Range("B" & nextrow)
rng.Offset(, -1) = "Table " & tabno
For Each rw In tbl.Rows
For Each cl In rw.Cells
rng.Value = cl.outerText
Set rng = rng.Offset(, 1)
I = I + 1
Next cl
nextrow = nextrow + 1
Set rng = rng.Offset(1, -I)
I = 0
Next rw
Next tbl
I = Range("B" & Rows.Count).End(xlUp).Row 'last row with data
Do While Cells(I, 1).Value = "" 'will loop until first not blank found in column A (starting from last row of data, from end to start)
For Each ThisLink In doc.getElementsByTagName("a") 'we check all <a> tags
If ThisLink.innerText = Cells(I, 2).Value Then Cells(I, 1).Value = ThisLink.href 'If the innertext is the name of the race, in column A we add link
Next ThisLink
I = I - 1 'we decrease row position
Loop
End Sub
But I need the script takes the date part of the URL and add one day each time till today and get the data. for example :
https://www.ukdogracing.net/racecards/01-06-2017/monmore
https://www.ukdogracing.net/racecards/01-07-2017/monmore
etc... How can I make the script to get the data for each day adding one each time.
Thanks in advance.
Replace the first sub with this one and it will run for the specified dates. I couldn't see I having any purpose so i removed it.
Sub GetData()
Dim IE As Object, doc As Object
Dim strURL As String, myDate As Date
Set IE = CreateObject("InternetExplorer.Application")
With IE
For myDate = CDate("01-05-2017") To CDate("01-09-2017")
strURL = "https://www.ukdogracing.net/racecards/" & Format(myDate, "mm-dd-yyyy") & "/monmore" ' Trim(Str(I))
.navigate strURL
Do Until .ReadyState = 4: DoEvents: Loop
Do While .Busy: DoEvents: Loop
Set doc = IE.Document
GetAllTables doc
Next myDate
.Quit
End With
End Sub
I am doing IE automation using VBA (Basically I open IE and goto the specific URL from the sheet and then login using credentials from the sheet and then extract data from the webpage to excel) This has to happen for 20 websites so I added for loop and it works fine.
What I want is, in case of any error occurs with in the loop then loop has to restart.
I also tried "on error got 0, on error got -1" but it did not work.
Below is my Code - Kindly pardon me for poor coding I am new to VBA.
Sub Get_Data()
Sheets("Sheet2").Select
Range("E2").Select
Range("H6:H120").ClearContents
Dim IE As Object
Set IE = CreateObject("InternetExplorer.application")
IE.Visible = True
Dim E As Long
Dim S As Long
E = Range("A" & Rows.Count).End(xlUp).Row
JumpToHere:
For j = S To E
S = Range("H" & Rows.Count).End(xlUp).Row
Sheets("Sheet2").Select
Range("E" & S).Select
ActiveCell.Offset(1, -2).Select
Dim X As Variant
X = ActiveCell.Value
IE.navigate X
Do
If IE.ReadyState = 4 Then
IE.Visible = True
Exit Do
Else
DoEvents
End If
Loop
ActiveCell.Offset(0, 1).Select
Dim Y As Variant
Y = ActiveCell.Value
IE.document.all("username").Value = Y
ActiveCell.Offset(0, 1).Select
Dim Z As Variant
Z = ActiveCell.Value
IE.document.all("password").Value = Z
IE.document.all("merchant_login_submit_button").Click
Application.Wait (Now + TimeValue("0:00:8"))
Set ElementCol = IE.document.getElementsByTagName("span")
For Each link In ElementCol
If link.innerHTML = "Authentication Failed" Then
ActiveCell.Offset(0, 3).Value = "Authentication Failed"
GoTo JumpToHere
End If
Next
Set tags = IE.document.getElementsByTagName("input")
For Each tagx In tags
If tagx.Value = "Continue to Control Panel" Then
tagx.Click
Application.Wait (Now + TimeValue("0:00:3"))
Exit For
End If
Next
Set ElementCol = IE.document.getElementsByTagName("a")
For Each link In ElementCol
If link.innerHTML = "Reports" Then
link.Click
End If
Next
Application.Wait (Now + TimeValue("0:00:06"))
Dim checkdate As Integer
checkdate = Format(Date, "dd") - 1
IE.document.getElementById("snapshot_group_by").Value = "payment_processor"
IE.document.getElementById("snapshot_end_date_day").Value = checkdate
IE.document.all("reports_submit_button").Click
Application.Wait (Now + TimeValue("0:00:3"))
Dim ws As Worksheet
Dim rng As Range
Dim tbl As Object
Dim rw As Object
Dim cl As Object
Dim tabno As Long
Dim nextrow As Long
Dim I As Long
Set ws = Worksheets.Add
For Each tbl In IE.document.getElementsByTagName("TABLE")
tabno = tabno + 1
nextrow = nextrow + 1
Set rng = ws.Range("B" & nextrow)
rng.Offset(, -1) = "Table " & tabno
For Each rw In tbl.Rows
For Each cl In rw.Cells
rng.Value = cl.outerText
Set rng = rng.Offset(, 1)
I = I + 1
Next cl
nextrow = 0
Set rng = rng.Offset(1, -I)
I = 0
Next rw
Next tbl
ws.Cells.ClearFormats
Sheets("Sheet2").Select
ActiveCell.Offset(0, 3).Value = ActiveSheet.Previous.Range("F4")
Application.DisplayAlerts = False
ActiveSheet.Previous.Delete
Application.DisplayAlerts = True
Set ElementCol = IE.document.getElementsByTagName("a")
For Each link In ElementCol
If link.innerHTML = "Logout" Then
link.Click
End If
Next
Next j
End Sub
Sounds like your real problem is that your code isn't properly waiting. Instead of Application.Wait, use a proper waiting loop any time you invoke the IE.Navigate or any element .Click or form .Submit event.
VBA HTML not running on all computers
Otherwise, you don't have any active error-trapping in your code. Wrap your loop with On Error statements, as below.
The first one, On Error GoTo MyErrorHandler instructs the program of what to do if an error is encountered within the loop. If there's an error, the code underneath the MyErrorHandler label will execute, and resume at the NextJ label. Once the loop finishes, On Error GoTo 0 returns normal (i.e., none) error-handling. Any errors occurring outside the loop still raise an exception during runtime.
Option Explicit
Sub Get_Data()
'// Dim your variables
'// Executable code starts here
JumpToHere:
For j = S To E
On Error GoTo MyErrorHandler
' Now ANY ERROR, ANYWHERE in the loop will go to the error handler
NextJ:
Next j
'// Code below this line won't be subject to the error handler
On Error GoTo 0
'// more code if you have it
' Exit gracefully if there was no error:
Exit Sub
'// Here is the error handler:
MyErrorHandler:
Err.Clear()
Resume NextJ
End Sub
If you truly want to re-start the loop, then instead of NextJ, do Resume JumpToHere.
I found this article explaining how to scrape certain tags from a website using Excel VBA.
The code below gets the content from the first <p> tag that it finds:
Sub get_title_header()
Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = True
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
Cells(i, 2) = doc.title
On Error GoTo err_clear
Cells(i, 3) = doc.GetElementsByTagName("p")(0).innerText
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Range(Cells(i, 1), Cells(i, 3)).Columns.AutoFit
Next i
End Sub
I'd like to make the scraper get all the content that is within a <p> tag on a webpage. So I guess a foreach functionality of some kind is missing.
How can the content from multiple <p> tags be collected?
UPDATE
The working code!
Sub get_title_header()
Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
Dim i As Integer
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = True
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
Cells(i, 2) = doc.Title
On Error GoTo err_clear
Dim el As Object
For Each el In doc.GetElementsByTagName("p")
counter = counter + 1
Cells(i, counter + 2).Value = Cells(counter + 1).Value & el.innerText
Next el
counter = 0
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Range(Cells(i, 1), Cells(i, 10)).Columns.AutoFit
Next i
End Sub
You're almost there! doc.GetElementsByTagName("p") returns a collection of HTMLParagraphElement objects of which you accessed the first entry using doc.GetElementsByTagName("p")(0). As you allude to, a For Each loop would let you access each in turn:
Sub get_title_header()
Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
Dim i As Integer
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = True
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
Cells(i, 2) = doc.Title
On Error GoTo err_clear
Dim el As Object
For Each el In doc.GetElementsByTagName("p")
Cells(i, 3).Value = Cells(i, 3).Value & ", " & el.innerText
Next el
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Range(Cells(i, 1), Cells(i, 3)).Columns.AutoFit
Next i
End Sub
If you just need to get the content of the webpage in plain text this code is more concise
Function WEBSITE_TEXT(Destination As String) As String
' Requires a reference to Microsoft XML, v6.0
' Draws on the stackoverflow answer at bit.ly/parseXML
Dim myRequest As XMLHTTP60
Dim myDomDoc As DOMDocument60
' Check and clean inputs
On Error GoTo exitRoute
If Destination = "" Then
WEBSITE_TEXT = ""
Exit Function
End If
' Read the XML data from the Google Maps API
Set myRequest = New XMLHTTP60
myRequest.Open "GET", Destination, False
myRequest.send
' Parse HTML content
Dim html As New HTMLDocument
Dim text As String
html.body.innerHTML = myRequest.responseText
' Return the website content
text = html.body.innerText
If Not html Is Nothing Then WEBSITE_TEXT = text
exitRoute:
' Tidy up
text = ""
Set myRequest = Nothing
End Function