I've got a working code but I'd like to replace my Drop-Down with Content Control, because I need to be able to also manually type in a value.
The value inside is a list from a https, this string works completely fine, so please ignore.
Here's my code:
Dim MyRequest As Object
Dim Data() As String
Dim i As Integer
Dim j As Integer
Dim maxi As Integer
Set MyRequest = CreateObject("WinHttp.WinHttpRequest.5.1")
MyRequest.Open "GET", _
"https... (This is hidden for security resons, /csv/)"
' Send Request.
MyRequest.Send
'And we get this response
'MsgBox MyRequest.ResponseText
Data = Split(MyRequest.ResponseText, "|")
If UBound(Data()) > 25 Then
maxi = 25
Else
maxi = UBound(Data())
End If
For j = 1 To 6
ActiveDocument.FormFields("Dropdown" & j).DropDown.ListEntries.Clear
For i = 0 To maxi - 1
ActiveDocument.FormFields("Dropdown" & j).DropDown.ListEntries.Add Name:=Data(i)
Next i
Next j
End Sub
You should not use content controls and formfields in the same document. They were not designed to be used that way and doing so is a known source of problems.
As you observed, dropdown formfields don’t support text entry. To provide that facility, you could provide an option in the dropdown for 'free text' and use an on-exit macro with an Inputbox to insert the user’s 'free text' into the dropdown. For example, suppose you have a dropdown with 5 items, the last of which offers free text entry (e.g. an 'Other' option). Adding the following on-exit macro to the formfield will provide that:
Sub FreeText()
Dim StrNew As String, i As Long
With Selection.FormFields(1).DropDown
i = .ListEntries.Count
If .Value = i Then
StrNew = Trim(InputBox("Input your text", "Data Entry", .ListEntries(i).Name))
If StrNew = vbNullString Then Exit Sub
.ListEntries(i).Delete
.ListEntries.Add StrNew
.Value = i
End If
End With
End Sub
Related
My system is working with Windows 10 64bit and Office 2016
I am a bit familiar with Excel VBA, and I am trying to learn web scraping using VBA. Unfortunately, there is little information on in-depth digging on web data.
The website I want to scrape data from is bizbuysell.com from the seller's offers such as
Sample URL 1
Sample URL 2
There is a section that starts with the headline Detailed Information
The HTML code is: Detailed Information
<h3>Detailed Information</h3>
Location:
Pinellas County, FL
Inventory:
Included in asking price
Employees:
8 FT
I want to scrape data from this section.
The problem is that there are some 18 data labels and their respective values possible, but only those are shown for which the seller has entered data.
My idea was to search for all possible data labels and if they are not available then next data field
I tried it with the following code, but Obviously I made a mistake
For Each ele In doc.getElementsByClassName("listingProfile_details")
txt = ele.parentElement.innerText
If Left(txt, 8) = "Location" Then
location = Trim(Mid(txt, InStrRev(txt, ":") + 1))
ElseIf Left(txt, 4) = "Inventory" Then
inventory = Trim(Mid(txt, InStrRev(txt, ":") + 1))
.
.
.
End If
Next ele
I hope that someone can show me the correct VBA code to check for all 18 possible data labels and the respective data
Thank you so much!
Tony
One way it to gather a nodeList of the dt/dd elements and loop it with a step 2 so you can access the label at n indices and the value at n + 1.
To handle differing numbers of labels being present, you can initialise a fresh dictionary, with all the possible labels as keys, and the associated values as vbNullString, during the loop over urls, such that for each new XHR request you get a new dictionary ready to populate with the labels that are found. By using .Exists test, you only update the values for keys (labels) that are found at the current URI.
You can store all results in an array to write out to the sheet in one go at end.
There are lots of additional notes within the code.
Option Explicit
Public Sub GetDetailedBizBuySellInfo()
Dim http As Object, urls() As Variant
Dim html As MSHTML.HTMLDocument 'VBE > Tools > References > Microsoft HTML Object Library
urls = Array("https://www.bizbuysell.com/Business-Opportunity/covid-friendly-commercial-cleaning-est-30-years-100k-net/1753433/?d=L2Zsb3JpZGEvaGlsbHNib3JvdWdoLWNvdW50eS1idXNpbmVzc2VzLWZvci1zYWxlLzI/cT1hVEk5T0RFc01qQXNNekFzTnpnbWJtRndQV1UlM0Q=", _
"https://www.bizbuysell.com/Business-Opportunity/Established-Cleaning-Business-Tampa-St-Pete/1849521/?utm_source=bizbuysell&utm_medium=emailsite&utm_campaign=shtmlbot&utm_content=headline")
Set http = CreateObject("MSXML2.XMLHTTP")
Set html = New MSHTML.HTMLDocument
Dim url As Long, results() As Variant
ReDim results(1 To UBound(urls) + 1, 1 To 19) 'size the final output array. _
There will be the number of urls as row count, the number of labels as column count + 1 to store the url itself. You need to update the list of labels below. See GetBlankDetailedInformationDictionary
With http
For url = LBound(urls) To UBound(urls) 'loop url list
.Open "Get", urls(url), False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
html.body.innerHTML = .responseText
Dim currentDetailedInformation As Scripting.Dictionary 'VBE > Tools > References > Microsoft Scripting Runtime
Set currentDetailedInformation = GetCurrentDetailedInfo(html) 'use retrieved html to return a dictionary with key as dt > strong e.g.Location; value as dd e.g. Tampa, FL
AddCurrentDetailedInfoToResults results, currentDetailedInformation, url, urls(url) 'url + 1 (zero indexed) will keep track of current row number to add to results
Next
End With
With ActiveSheet 'better to update with explicit sheet/be careful not to overwrite data already in a sheet
.Cells(1, 1).Resize(1, UBound(results, 2)) = currentDetailedInformation.keys ' write out headers
.Cells(1, UBound(results, 2)) = "Url"
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results ' write out results
End With
End Sub
Public Sub AddCurrentDetailedInfoToResults(ByRef results As Variant, ByVal currentDetailedInformation As Scripting.Dictionary, ByVal url As Long, ByVal currentUrl As String)
Dim key As Variant, currentColumn As Long
For Each key In currentDetailedInformation.keys
currentColumn = currentColumn + 1 'increase column count to update results array with
results(url + 1, currentColumn) = currentDetailedInformation(key)
Next
results(url + 1, currentColumn + 1) = currentUrl
End Sub
Public Function GetCurrentDetailedInfo(ByVal html As MSHTML.HTMLDocument) As Scripting.Dictionary
' Gathers a list of all the relevant dd, dt nodes within the passed in HTMLDocument.
' Requests a new blank dictionary whose keys are the labels (child strong element of dt tag)
'Updates blank dictionary, per key, where present, with dd value in a loop of step 2 as list is strong, dd, strong, dd etc.....
Dim updatedDictionary As Scripting.Dictionary, listOfLabelsAndValues As MSHTML.IHTMLDOMChildrenCollection
Set updatedDictionary = GetBlankDetailedInformationDictionary
'Css pattern to match the appropriate nodes
Set listOfLabelsAndValues = html.querySelectorAll("#ctl00_ctl00_Content_ContentPlaceHolder1_wideProfile_listingDetails_dlDetailedInformation dt > strong, #ctl00_ctl00_Content_ContentPlaceHolder1_wideProfile_listingDetails_dlDetailedInformation dd")
Dim currentIndex As Long
For currentIndex = 0 To listOfLabelsAndValues.length - 2 Step 2 'nodeList is 0 index based
'On Error Resume Next 'key (label) may not be present for current html document _
i.e. url so ignore errors when attempting to update blank dictionary via dt > strong matching on key. If label not found then value = vbNullString
Dim key As String, value As String
key = Trim$(listOfLabelsAndValues.Item(currentIndex).innerText)
value = Trim$(listOfLabelsAndValues.Item(currentIndex + 1).innerText) 'as we are looping every 2 indices 0,2,4 ....
If updatedDictionary.Exists(key) Then updatedDictionary(key) = value
'On Error GoTo 0
Next
Set GetCurrentDetailedInfo = updatedDictionary ' return updated dictionary
End Function
Public Function GetBlankDetailedInformationDictionary() As Scripting.Dictionary
Dim blankDictionary As Scripting.Dictionary, keys() As Variant, key As Long
Set blankDictionary = New Scripting.Dictionary
'' TODO Note: you would add in all 18 labels into array below.
keys = Array("Location:", "Type:", "Inventory:", "Real Estate:", "Building SF:", _
"Building Status:", "Lease Expiration:", "Employees:", "Furniture, Fixtures, & Equipment (FF&E):", _
"Facilities:", "Competition:", "Growth & Expansion:", "Financing:", "Support & Training:", _
"Reason for Selling:", "Franchise:", "Home-Based:", "Business Website:")
For key = LBound(keys) To UBound(keys)
blankDictionary(keys(key)) = vbNullString 'add blank entry to dictionary for each label
Next
Set GetBlankDetailedInformationDictionary = blankDictionary
End Function
The single most annoying feature in Excel's built-in VBA editor is—in my opinion—the aggressive autoformatting of the code, which insists on rewriting what I have typed as soon as the cursor leaves the line. It is particularly distressing that the editor collapses all whitespace, thus preventing any meaningful code alignment. For example, if I try to align a sequence of assignments by the equals sign with values aligned by the decimal separator:
price = 10.01
quantity = 3.2
vat = 0.11
the editor inevitably scrambles it by collapsing all spaces:
price = 10.01
quantity = 3.2
vat = 0.11
Is there any way to avoid this kind unwelcome autoformatting?
Assignment cosmetics :-)
There's neither a special VBE property to change the VBE (autoformatting) options directly nor a way to do it programatically. - So afaik VBE irrevocably forces autoformatting upon the user by partial workarounds.
a) Class method
For the sake of the art and just for fun an actually (very) basic class approach to give you a starting idea; assignment arguments are passed as strings allowing any optical formatting - if that's what you really want:
Example call in current module
Sub ExampleCall()
Dim x As New cVars
x.Add "price = 11.11" ' wrong assignment
'...
x.Add "price = 10.01" ' later correction
x.Add "quantity = 1241.01"
x.Add "vat = 0.11"
Debug.Print "The price is $ " & x.Value("price")
End Sub
Class module cVars
Option Explicit
Private dict As Object
Sub Add(ByVal NewValue As Variant)
'split string tokens via equal sign
Dim tmp
tmp = Split(Replace(Replace(NewValue, vbTab, ""), " ", "") & "=", "=")
'Identify key and value item
Dim myKey As String, myVal
myKey = tmp(0)
myVal = tmp(1): If IsNumeric(myVal) Then myVal = Val(myVal)
'Add to dictionary
If dict.exists(myKey) Then
dict(myKey) = myVal
Else
dict.Add myKey, myVal
End If
'Debug.Print "dict(" & myKey & ") =" & dict(myKey)
End Sub
Public Property Get Value(ByVal myVarName As String) As Variant
'get variable value
Value = dict(myVarName)
End Property
Private Sub Class_Initialize()
'set (late bound) dict to memory
If dict Is Nothing Then Set dict = CreateObject("Scripting.Dictionary")
End Sub
Private Sub Class_Terminate()
Set dict = Nothing
End Sub
Edit #1 as of 3/3 2021
b) Rem Evaluation method
Once again only for the sake of the art a way to read assignments entered into outcommented code lines via, yes via Rem (heaving a deep sigh for this archaic use originating from former Basic times) as it allows to format data with any wanted spaces or tabs and won't be mixed up hopefully with current outcommentings via apostrophe '.
This Test procedure only needs the usual declarations plus some assignment calls as well as the mentioned Rem part. Two simple help procedures get code lines, analyze them via a dictionary class cVars and eventually assign them.
Note that the following example
needs a library reference to Microsoft Visual Basic Extensibility 5.3 and
uses the unchanged class cVars of section a) simply to avoid rewriting it.
Option Explicit
Private Const THISMODULE As String = "Module1" ' << change to current code module name
Sub Test() ' procedure name of example call
'Declare vars
Dim price As Double: Assign "price", price
Dim quantity As Double: Assign "quantity", quantity
Dim vat As Double: Assign "vat", vat
'~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
'Enter assignments via Rem(ark)
'(allowing any user defined formatting therein)
'~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Rem price = 10.01
Rem quantity = 1241.01
Rem vat = 0.11
'~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Debug.Print quantity & " à $" & price & " = " & Format(quantity * price, "$#,##0.00")
End Sub
Help procedure Assign evaluating Rem codelines in procedure Test
Sub Assign(ByVal myVarName As String, ByRef myvar)
Const MyProc As String = "Test"
Dim codelines
getCodelines codelines, THISMODULE, ProcedureName:=MyProc
'Debug.Print Join(codelines, vbNewLine)
Dim x As New cVars ' set class instance to memory
Dim line As Variant, curAssignment
For Each line In codelines
curAssignment = Split(line, "Rem ")(1) ' remove Rem prefix from codelines
If curAssignment Like myVarName & "*" Then
x.Add curAssignment
myvar = x.Value(myVarName)
End If
Next
End Sub
Help procedure getCodelines
Called by above proc Assign. Returns the relevant Rem Codelines from the calling procedure Test. - Of course it would have been possible to filter only one codeline.
Sub getCodelines(ByRef arr, ByVal ModuleName As String, ByVal ProcedureName As String)
Const SEARCH As String = "Rem "
'a) set project
Dim VBProj As Object
Set VBProj = ThisWorkbook.VBProject
If VBProj.Protection = vbext_pp_locked Then Exit Sub ' escape locked projects
'b) set component
Dim VBComp As Object
Set VBComp = VBProj.VBComponents(ModuleName)
Dim pk As vbext_ProcKind
'd) get relevant code lines
With VBComp.CodeModule
'count procedure header lines
Dim HeaderCount As Long: HeaderCount = .ProcBodyLine(ProcedureName, pk) - .ProcStartLine(ProcedureName, pk)
'get procedure code
Dim codelines
codelines = Split(.lines(.ProcBodyLine(ProcedureName, pk), .ProcCountLines(ProcedureName, pk) - HeaderCount), vbNewLine)
'filter code lines containing "Rem" entries
codelines = Filter(codelines, SEARCH, True)
End With
'return elements
arr = codelines
End Sub
Don't forget to integrate the class module CVars from section a)!
I have replicated the code in Excel macro to search a website with excel data and extract specific results and then loop for next value, although I get a error on the line URL_Get_SKU_Query1 = entityRange.Offset(0, 1).Value2 stating "object variable or with block variable not set"
So I am just trying to replicate the code for another website.
This code pulls in a certain text and spits out a value from the webiste.
So I would like to enter in MFR SKU in sheet 1 as such:
Name // SKU // Price
WaterSaverFaucet // SS902BC
After I have created a macro button on sheet 2 and clicking it
Then have it spit out the price.
So that it ends up like this below:
Name // SKU // Price
WaterSaverFaucet // SS902BC // 979.08
I would need this in order to look up multiple items on a website.
Sub LoopThroughBusinesses1()
Dim i As Integer
Dim SKU As String
For i = 2 To Sheet1.UsedRange.Rows.Count
SKU = Sheet1.Cells(i, 2)
Sheet1.Cells(i, 3) = URL_Get_SKU_Query1(SKU)
Next i
End Sub
Function URL_Get_SKU_Query1(strSearch As String) As String ' Change it from a Sub to a Function that returns the desired string
' strSearch = Range("a1") ' This is now passed as a parameter into the Function
Dim entityRange As Range
With Sheet2.QueryTables.Add( _
Connection:="URL;https://www.neobits.com/SearchBySKU.aspx?SearchText=" & strSearch & "&safe=active", _
Destination:=Sheet2.Range("A1")) ' Change this destination to Sheet2
.BackgroundQuery = True
.TablesOnlyFromHTML = True
.Refresh BackgroundQuery:=False
.SaveData = True
End With
' Find the Range that has "Price"
Set entityRange = Sheet2.UsedRange.Find("Price")
' Then return the value of the cell to its' right
URL_Get_SKU_Query1 = entityRange.Offset(0, 1).Value2
' Clear Sheet2 for the next run
Sheet2.UsedRange.Delete
End Function
Your logic is flawed unfortunately. You cannot simply take the mechanism from one webpage and assume it works for the next. In this case the solution you are trying will not work. When you enter a SKU into search what actually happens is a page re-direct (302). Not the construction of an url as you have tried. You are getting the error you see primarily due to hitting a page not found - though surfaces due to your element not being found on the 404 page.
Instead, you can use the construct the page in question actually uses for initial url and then you can use xmlhttp which will follow the re-direct as follows:
VBA:
Option Explicit
Public Sub GetPrices()
Dim xhr As XMLHTTP60, html As HTMLDocument, ws As Worksheet, i As Long
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set xhr = New XMLHTTP60
Set html = New HTMLDocument
Dim allData()
allData = ws.UsedRange.Value
With xhr
For i = 2 To UBound(allData, 1)
.Open "GET", "https://www.neobits.com/search?keywords=" & allData(i, 2), False
.send
Dim price As Object
html.body.innerHTML = .responseText
Set price = html.querySelector("#main_price")
If Not price Is Nothing Then
allData(i, 3) = price.innerText
Else
allData(i, 3) = "No price found"
End If
Set price = Nothing
Next
End With
ws.Cells(1, 1).Resize(UBound(allData, 1), UBound(allData, 2)) = allData
End Sub
I assume your page set-up, in Sheet1, is as follows:
Required project references:
The two references bounded in red are required. Press Alt+F11 to open the VBE and then go Tools > References and add references. You may have a different version number for xml library - in which case reference will need changing as will code references of
Dim xhr As XMLHTTP60
and
New XMLHTTP60
To run this code:
Press Alt+F11 to open the VBE > Right click in project explorer > Add standard module. Paste code into that standard module > Select anywhere inside the code and press F5, or hit the green Run arrow in the ribbon.
You could further develop, for example, to handle non 200 status codes:
Option Explicit
Public Sub GetPrices()
Dim xhr As XMLHTTP60, html As HTMLDocument, ws As Worksheet, i As Long
Set ws = ThisWorkbook.Worksheets("Sheet1")
Set xhr = New XMLHTTP60
Set html = New HTMLDocument
Dim allData(), price As Object
allData = ws.UsedRange.Value
With xhr
For i = 2 To UBound(allData, 1)
.Open "GET", "https://www.neobits.com/search?keywords=" & allData(i, 2), False
.send
If .Status <> 200 Then
allData(i, 3) = "Status not succeeded" '<== Little bit loose but you get the idea.
Else
html.body.innerHTML = .responseText
Set price = html.querySelector("#main_price")
If Not price Is Nothing Then
allData(i, 3) = price.innerText
Else
allData(i, 3) = "No price found"
End If
Set price = Nothing
End If
Next
End With
ws.Cells(1, 1).Resize(UBound(allData, 1), UBound(allData, 2)) = allData
End Sub
' Find the Range that has "Entity Type:"
Set entityRange = Sheet2.UsedRange.Find("Lists At:")
' Then return the value of the cell to its' right
URL_Get_SKU_Query1 = entityRange.Offset(0, 1).Value2
The problem is that Range.Find may not find what you're looking for, for various reasons. Always specify the optional parameters to that function, since it otherwise "conveniently remembers" the values from the last time it was invoked - either from other VBA code, or through the Excel UI (IOW there's no way to be 100% sure of what values it's going to be running with if you don't specify them). But even then, if Range.Find doesn't find what it's looking for, it will return Nothing - and you can't just assume that will never happen!
But, reading closer...
' Find the Range that has "Entity Type:"
Set entityRange = Sheet2.UsedRange.Find("Lists At:")
Someone's lying. Read the comment. Now read the code. Who's telling the truth? Don't write comments that say "what" - have comments say "why", and let the code say "what". Otherwise you have situations like that, where it's impossible to tell whether the comment is outdated or the code isn't right, at least not without looking at the worksheet.
In any case, you need to make sure entityRange isn't Nothing before you try to make a member call against it:
If Not entityRange Is Nothing Then
URL_Get_SKU_Query1 = entityRange.Offset(0, 1).Value2
End If
I wrote a macro to go to WU to get historical data and for the most part, it works. However, I believe that the macro is running too fast for it to pick up the data from the website.
https://www.wunderground.com/history/daily/us/tx/el-paso/KELP/date/2017-1-3
Is the website and the table I want to get is tablesaw-sortable.
I have tried the following: DoEvents and Application.Wait (Now + TimeValue("00:00:01")) to try to slow down the process.
Sub BrowseToWU()
Dim IE As New SHDocVw.InternetExplorer
Dim HTMLDoc As MSHTML.HTMLDocument
Dim RowAddress As Integer
Dim WebAddress As String
Dim DateSheet As Date
Dim WkDay As Integer
Dim DateSheetName As String
'Application.ScreenUpdating = False
'Application.StatusBar = True
RowAddress = 2
IE.Visible = True
Do Until RowAddress = 60
WebAddress = Range("A" & RowAddress)
DateSheet = Right(WebAddress, 8)
DateSheetName = Right(WebAddress, 8)
WkDay = Weekday(DateSheet, vbSunday)
If WkDay < 3 Then
RowAddress = RowAddress + 1
ElseIf WkDay > 6 Then
RowAddress = RowAddress + 1
Else
IE.Navigate WebAddress
Do While IE.ReadyState <> READYSTATE_COMPLETE
Loop
Set HTMLDoc = IE.Document
DoEvents
Application.Wait (Now + TimeValue("00:00:05"))
DoEvents
ProcessHTMLPage HTMLDoc
DateSheet = Right(WebAddress, 8)
DoEvents
Application.Wait (Now + TimeValue("00:00:01"))
ActiveSheet.Name = DateSheetName
DoEvents
RowAddress = RowAddress + 1
'IE.Quit
Worksheets("Sheet1").Activate
End If
Loop
End Sub
Option Explicit
Sub ProcessHTMLPage(HTMLPage As MSHTML.HTMLDocument)
Dim HTMLTable As MSHTML.IHTMLElement
Dim HTMLTables As MSHTML.IHTMLElementCollection
Dim HTMLRow As MSHTML.IHTMLElement
Dim HTMLCell As MSHTML.IHTMLElement
Dim RowNum As Long, ColNum As Integer
'Dim IE As New SHDocVw.InternetExplorer
'Dim Ws As Worksheet
Set HTMLTables = HTMLPage.getElementsByClassName("tablesaw-sortable")
'DoEvents
For Each HTMLTable In HTMLTables
Worksheets.Add
DoEvents
Range("A1").Value = HTMLTable.className
Range("B1").Value = Now
RowNum = 2
For Each HTMLRow In HTMLTable.getElementsByTagName("tr")
'Debug.Print vbTab & HTMLRow.innerText
ColNum = 1
For Each HTMLCell In HTMLRow.Children
Cells(RowNum, ColNum) = HTMLCell.innerText
ColNum = ColNum + 1
Next HTMLCell
RowNum = RowNum + 1
Next HTMLRow
Next HTMLTable
DoEvents
'IE.Quit
End Sub
The macro is supposed to run through sheet1 picking up the web address to the historical data if it satisfies the criteria of being a certain day of the week.
IE will open and then it will kick over to the next module that will take in the data.
A new worksheet is created and the data pasted into the new worksheet.
The worksheet is renamed to the date of the data.
The web address sheet is activated again and the process starts over again.
The error I get is that the data isn't taken from the website, so the For statement ends and the web address sheet is renamed and an error occurs.
One way around this is to call the API that the page is using to get that info.
The API returns json which you can parse with a json parser. I use jsonconverter.bas. After installing the code from that link in a standard module called JsonConverter, go to VBE > Tools > References > Add a reference to Microsoft Scripting Runtime.
Finding the API:
If you press F12 to open developer tools and go to the Network tab and then press F5 to refresh any url of interest you will see the recorded web traffic. You can find the API call there.
See my answer here on how to search the network traffic using a specific observation value you expect to see in the response - this will filter the list of network traffic to those items containing the value of interest. Be sensible in selecting the value - you want something unlikely to occur elsewhere. You can also filter the network traffic to XHR only.
The API response:
The API returns json. More specifically, it returns a dictionary containing 2 keys. The second key, "observations", can be used to return a collection (denoted by []) of dictionaries (denoted by {}).
Each dictionary represents a row of the table (daily observations). You can loop this collection, and then loop the inner dictionaries, to access the table row values and reconstruct the table by populating an array. Explore example json response here.
Explanation of json structure:
click here to enlarge
Explanation of code:
The code is broken down into a number of helper subs and functions, allocating certains tasks to each, to
make code easier to debug and follow, as well as better align with Object Oriented Programming Principles.
Overall the process is:
Gather urls for Worksheet("Sheet1"). Helper function GetAllUrls.
Process those urls and only retain the dates which correspond with Tue-Thur. These are kept as strings formatted as "yyyymmdd" so can be passed to API later. This is handled by helper functions GetOnlyQualifyingUrlsDates and IncludeThisDate. IncludeThisDate performs the check for whether to include; GetOnlyQualifyingUrlsDates handles the looping and formatting of results.
Issue xmlhttp requests by looping over qualifying url dates and concatenating those into the url for the API call, then issuing the request. This is performed by the main sub GetTables.
Sheet creation, for output, is handled by helper function CreateWorksheet. This function calls another helper function, SheetExists, to ensure sheets are only created if they don't already exist, otherwise, the existing sheet by that name is used.
The resultant json response, from step 3, is passed to a helper sub WriteOutResults which accepts the json variable and the output sheet object as arguments. It extracts all the info from the json response; essentially reconstructing the table. It adds the table and headers to the appropriate sheet.
It calls helper function Epoch2Date, which handles the unix timestamp to datetime conversion for the two unix fields in the json object.
TODO:
The API key may be time limited. Add a helper function which returns the current valid key.
The API accepts start date and end date parameters in the url construct. It would be far better to issue one request for the entire range if possible, or chunked ranges e.g. months, to reduce the number of requests made. This would also reduce the likelihood of being blocked. This would mean some additional code would need to be written, before writing out results, to ensure only dates of interest are being written to sheets. Though you could write out all then simply loop all sheets and delete those that aren't wanted (perfectly doable if we are talking about 365 dates total). Personally, I would handle the include date part in the construction of the table from a single request (if possible) that has the min and max dates for entire urls listed passed as start and end date parameters. I would then write a single flat table out to one sheet as this will be much easier for later data analysis.
VBA:
Option Explicit
Public Sub GetTables()
'VBE > Tools > References > Microsoft Scripting Runtime
Dim json As Object, qualifyingUrlsDates(), urls(), url As String
Dim ws As Worksheet, wsOutput As Worksheet, i As Long, startDate As String, endDate As String
Set ws = ThisWorkbook.Worksheets("Sheet1")
urls = GetAllUrls(2, ws, "A")
qualifyingUrlsDates = GetOnlyQualifyingUrlsDates(urls)
'API key may be not be valid over time so look at obtaining by prior request
With CreateObject("MSXML2.XMLHTTP") 'issue xmlhttp request for each valid date (this would be better done using start and enddate to specify entire range _
of batches e.g. months within total range to cut down on requests
For i = LBound(qualifyingUrlsDates) To UBound(qualifyingUrlsDates)
startDate = qualifyingUrlsDates(i)
endDate = startDate ' a little verbose but useful for explaining
url = "https://api.weather.com/v1/geocode/31.76/-106.49/observations/historical.json?apiKey=6532d6454b8aa370768e63d6ba5a832e&startDate=" & startDate & "&endDate=" & endDate & "&units=e"
.Open "GET", url, False
.send
Set json = JsonConverter.ParseJson(.responseText)("observations")
Set wsOutput = CreateWorksheet(qualifyingUrlsDates(i))
WriteOutResults wsOutput, json
Next
End With
End Sub
Public Sub WriteOutResults(ByVal wsOutput As Worksheet, ByVal json As Object)
'json is a collection of dictionaries. Each dictionary is a time period reading from the day i.e. one row in output
Dim results(), item As Object, headers(), r As Long, c As Long, key As Variant
headers = json.item(1).keys 'get the headers which are the keys of each dictionary
ReDim results(1 To json.Count, 1 To UBound(headers) + 1)
For Each item In json
r = r + 1: c = 0 'increase row in results array to store results for table row
For Each key In item.keys
c = c + 1 'increase column number in results array for writing out results
Select Case key
Case "valid_time_gmt", "expire_time_gmt" 'convert unix timestamp fields to datetime
results(r, c) = Epoch2Date(item(key))
Case Else
results(r, c) = item(key)
End Select
Next
Next
With wsOutput
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Public Function GetOnlyQualifyingUrlsDates(ByVal urls As Variant) As Variant
Dim i As Long, output(), counter As Long
ReDim output(1 To UBound(urls))
For i = LBound(urls) To UBound(urls)
If IncludeThisDate(urls(i)) Then 'check if weekday is to be included
counter = counter + 1
output(counter) = Format$(Right$(urls(i), 8), "yyyymmdd") 'if to include then add to output array of urls of interest
End If
Next
ReDim Preserve output(1 To counter)
GetOnlyQualifyingUrlsDates = output
End Function
Public Function IncludeThisDate(ByVal url As String) As Boolean
'tue, wed, thurs are valid
IncludeThisDate = Not IsError(Application.Match(Weekday(Right$(url, 8), vbSunday), Array(3, 4, 5)))
End Function
Public Function SheetExists(ByVal sheetName As String) As Boolean '<== function by #Rory
SheetExists = Evaluate("ISREF('" & sheetName & "'!A1)")
End Function
Public Function GetAllUrls(ByVal startRow As Long, ByVal ws As Worksheet, ByVal columnName As String) As Variant
'transpose used based on premise no more than a couple of years of dates
'startRow is start row for urls, ws is sheet where urls found, columnName is string representation of column for urls e.g. "A"
With ws
GetAllUrls = Application.Transpose(ws.Range("A" & startRow & ":A" & .Cells(.rows.Count, columnName).End(xlUp).Row).Value)
End With
End Function
Public Function CreateWorksheet(ByVal sheetName As String) As Worksheet
Dim ws As Worksheet
If SheetExists(sheetName) Then
Set ws = ThisWorkbook.Worksheets(sheetName)
'do something.... clear it? Then add new data to it?
Else
Set ws = ThisWorkbook.Worksheets.Add
ws.Name = sheetName
End If
Set CreateWorksheet = ws
End Function
Public Function Epoch2Date(ByVal E As Currency, Optional msFrac) As Date '# Schmidt http://www.vbforums.com/showthread.php?805245-EPOCH-to-Date-and-vice-versa
Const Estart As Double = #1/1/1970#
msFrac = 0
If E > 10000000000# Then E = E * 0.001: msFrac = E - Int(E)
Epoch2Date = Estart + (E - msFrac) / 86400
End Function
I have 8000 values in an Excel spreadsheet.
I need to search a website and then record a specific line of data from the website to in the Excel spreadsheet.
I found code which searches for data excel macro to search a website and extract results
Sub URL_Get_ABN_Query()
strSearch = Range("a1")
With ActiveSheet.QueryTables.Add( _
Connection:="URL;http://www.abr.business.gov.au/SearchByABN.aspx?SearchText=" & _
strSearch & "&safe=active", _
Destination:=Range("a5"))
.BackgroundQuery = True
.TablesOnlyFromHTML = True
.Refresh BackgroundQuery:=False
.SaveData = True
End With
'enter code here
End Sub
It collects the data from the website like this.
I only want the 'entity type' data line.
I can not find how to extend the code to only grab this line and input to the corresponding cell. i.e. ABN(b2)search, find input 'entity type' and paste into Company Type(c2).
Alternatively, I tried to find how to fill the information vertically instead of horizontally. I could delete the columns that are not needed. I thought this may be simpler.
I tried to record the macro with developer.
I also need to loop to the next ABN and populate the corresponding field and so on (B3>C3, B4>C4, etc.).
This is absolutely possible. You've got what I often find the hardest part, sourcing the information from another platform. To make this work I would separate it out a little bit and for simplicity use 2 sheets (Sheet1 with your known data and Sheet2 for the web data).
Loop through your table of ~8000 businesses. We can identify this from the UsedRange number of Rows. We know that the ABN is in column 2 (also known as B) so we copy that into the variable to pass to the function. The function will return the "Entity type:" to column 3 (C) of the same row.
Sub LoopThroughBusinesses()
Dim i As Integer
Dim ABN As String
For i = 2 To Sheet1.UsedRange.Rows.Count
ABN = Sheet1.Cells(i, 2)
Sheet1.Cells(i, 3) = URL_Get_ABN_Query(ABN)
Next i
End Sub
Change the subroutine you created to a Function so it returns the entity type you are after. The function will save the data into Sheet2 and then return just the Entity data that we are after.
Function URL_Get_ABN_Query(strSearch As String) As String ' Change it from a Sub to a Function that returns the desired string
' strSearch = Range("a1") ' This is now passed as a parameter into the Function
Dim entityRange As Range
With Sheet2.QueryTables.Add( _
Connection:="URL;http://www.abr.business.gov.au/SearchByABN.aspx?SearchText=" & strSearch & "&safe=active", _
Destination:=Sheet2.Range("A1")) ' Change this destination to Sheet2
.BackgroundQuery = True
.TablesOnlyFromHTML = True
.Refresh BackgroundQuery:=False
.SaveData = True
End With
' Find the Range that has "Entity Type:"
Set entityRange = Sheet2.UsedRange.Find("Entity type:")
' Then return the value of the cell to its' right
URL_Get_ABN_Query = entityRange.Offset(0, 1).Value2
' Clear Sheet2 for the next run
Sheet2.UsedRange.Delete
End Function
You do not want a load of connections (queryTables) set up in this way. It will be so slow if even possible. At 8000 requests, provided xmlhttp is not blocked or throttled, the below method will be significantly faster. If there does appear to be slowing/blocking then add in a small wait every x requests.
If possible use xmlhttp to gather data. Use css selectors to specifically target the entity type. Store values in an array and write out with loop at end. Use a class to hold the xmlhttp object for greater efficiency. Provide your class with methods including how to handle not found (example given). Add some further optimizations e.g. given is switching off screen-updating. This assumes your search numbers are in column B from B2. The code below also does some basic checks that there is something present in column B and handles the case of there being 1 or more numbers.
Good code is modular and you want a function to return something and a sub to perform actions. A single sub/function shouldn't complete lots of tasks. You want to easily debug with code that follows the principle of single responsibility (or close to it).
class clsHTTP
Option Explicit
Private http As Object
Private Sub Class_Initialize()
Set http = CreateObject("MSXML2.XMLHTTP")
End Sub
Public Function GetHTML(ByVal URL As String) As String
Dim sResponse As String
With http
.Open "GET", URL, False
.send
GetHTML = StrConv(.responseBody, vbUnicode)
End With
End Function
Public Function GetEntityType(ByVal html As HTMLDocument) As String
On Error GoTo errhand:
GetEntityType = html.querySelector("a[href*='EntityTypeDescription']").innerText
Exit Function
errhand:
GetEntityType = "Not Found"
End Function
Standard module:
Option Explicit
Public Sub GetInfo()
Dim http As clsHTTP, sResponse As String, lastRow As Long, groupResults(), i As Long, html As HTMLDocument
Set html = New HTMLDocument
Set http = New clsHTTP
Const BASE_URL As String = "http://www.abr.business.gov.au/ABN/View/"
Application.ScreenUpdating = False
With ThisWorkbook.Worksheets("Sheet1")
lastRow = .Cells(.rows.Count, "B").End(xlUp).Row
Select Case lastRow
Case 1
Exit Sub
Case 2
ReDim arr(1, 1): arr(1, 1) = .Range("B2").Value
Case Else
arr = .Range("B2:B" & lastRow).Value
End Select
ReDim groupResults(1 To lastRow - 1)
With http
For i = LBound(arr, 1) To UBound(arr, 1)
If Len(BASE_URL & arr(i, 1)) > Len(BASE_URL) Then
sResponse = .GetHTML(BASE_URL & arr(i, 1))
html.body.innerHTML = sResponse
groupResults(i) = .GetEntityType(html)
sResponse = vbNullString: html.body.innerHTML = vbNullString
End If
Next
End With
For i = LBound(groupResults) To UBound(groupResults)
.Cells(i + 1, "C") = groupResults(i)
Next
End With
Application.ScreenUpdating = True
End Sub
References (VBE> Tools > References):
Microsoft HTML Object Library
CSS selectors:
I use the fact the entity description is a hyperlink (a tag) and that its value contains the string EntityTypeDescription to use a css attribute = value with contains (*) operator to target.