VBA copy html table data to excel worksheet - excel

I need a VBA script that can extract local html table data to an Excel worksheet. I have some code (found it somewhere on the web) that works by using a URL link, but what I want is to be able to do it using my locally stored html file. The error is I get is 'app defined or object defined error'.
Sub HTML_Table_To_Excel()
Dim htm As Object
Dim Tr As Object
Dim Td As Object
Dim Tab1 As Object
'Replace the URL of the webpage that you want to download
Web_URL = "http://espn.go.com/nba/"
'Create HTMLFile Object
Set HTML_Content = CreateObject("htmlfile")
'Get the WebPage Content to HTMLFile Object
With CreateObject("msxml2.xmlhttp")
.Open "GET", Web_URL, False
.send
HTML_Content.body.innerHTML = .responseText 'this is the highlighted part for the error
End With
Column_Num_To_Start = 1
iRow = 2
iCol = Column_Num_To_Start
iTable = 0
'Loop Through Each Table and Download it to Excel in Proper Format
For Each Tab1 In HTML_Content.getElementsByTagName("table")
With HTML_Content.getElementsByTagName("table")(iTable)
For Each Tr In .Rows
For Each Td In Tr.Cells
Sheets(1).Cells(iRow, iCol).Select
Sheets(1).Cells(iRow, iCol) = Td.innerText
iCol = iCol + 1
Next Td
iCol = Column_Num_To_Start
iRow = iRow + 1
Next Tr
End With
iTable = iTable + 1
iCol = Column_Num_To_Start
iRow = iRow + 1
Next Tab1
MsgBox "Process Completed"
End Sub

I had the same problem and to solve it I used the original code of the question, but instead of downloading the html, I opened the html as a text file and the result was passed to the object HTML_Content.body.innerHtml the rest of the code is same.
Sub HTML_Table_To_Excel()
Dim htm As Object
Dim Tr As Object
Dim Td As Object
Dim Tab1 As Object
Dim file as String
'Replace the file path with your own
file = "c:\your_File.html"
'Determine the next file number available for use by the FileOpen function
TextFile = FreeFile
'Open the text file
Open file For Input As TextFile
'Create HTMLFile Object
Set HTML_Content = CreateObject("htmlfile")
HTML_Content.body.innerHtml = Input(LOF(TextFile), TextFile)
Column_Num_To_Start = 1
iRow = 2
iCol = Column_Num_To_Start
iTable = 0
'Loop Through Each Table and Download it to Excel in Proper Format
For Each Tab1 In HTML_Content.getElementsByTagName("table")
With HTML_Content.getElementsByTagName("table")(iTable)
For Each Tr In .Rows
For Each Td In Tr.Cells
Sheets(1).Cells(iRow, iCol).Select
Sheets(1).Cells(iRow, iCol) = Td.innerText
iCol = iCol + 1
Next Td
iCol = Column_Num_To_Start
iRow = iRow + 1
Next Tr
End With
iTable = iTable + 1
iCol = Column_Num_To_Start
iRow = iRow + 1
Next Tab1
MsgBox "Process Completed"
End Sub

Not sure if i've followed the conventions, but i somehow managed to get an HTML table exported to excel successfully. Here's my vb script. Any optimizations/corrections are welcome! Thanks.
Sub Export()
rowsLength =document.all.yourHTMLTableId.rows.length
cellLength= (document.all.yourHTMLTableId.Cells.length/rowsLength) 'Because i dont know how to get no.of cells in a row,so used a simple division
Set crr = CreateObject("WScript.Shell")
fileNm= "Export"
dir= crr.CurrentDirectory&"\"&fileNm&".xlsx"
Set objExcel = CreateObject("Excel.Application")
Set objWorkbook = objExcel.Workbooks.Add()
Set objWorksheet= objWorkbook.Worksheets(1)
i=0
j=0
do until i=rowsLength
do until j=cellLength
objWorksheet.cells(i+1,j+1).value = document.all.yourHTMLTableId.rows(i).cells(j).innerHTML
msgbox document.all.yourHTMLTableId.rows(i).cells(j).innerHTML
j= j+1
Loop
j=0
i=i+1
Loop
objWorkbook.SaveAs(dir)
objWorkbook.close
objExcel.Quit
Set objExcel = Nothing
End Sub

Related

VBA Macro- How to scrape these tables from website

I am trying to fetch all web tables but I am not able to fetch them. I have tried same program for other website and its working but for this particular website its not working at all. I have used other api url as well but its not fetching data properly
Sub Export_HTML_Table_To_Excel()
Dim htm As Object
Dim Tr As Object
Dim Td As Object
Dim Tab1 As Object
'Replace the URL of the webpage that you want to download
Web_URL = "https://www.indiaratings.co.in/pressrelease/60901"
'Web_Url="https://www.indiaratings.co.in/pressReleases/GetPressreleaseData pressReleaseId=60901&uniqueIdentifier=122.186.172.34-20230209"
'Create HTMLFile Object
Set HTML_Content = CreateObject("htmlfile")
'Get the WebPage Content to HTMLFile Object
With CreateObject("msxml2.xmlhttp")
.Open "GET", Web_URL, False
.send
HTML_Content.body.innerHTML = .responseText
End With
Column_Num_To_Start = 1
iRow = 2
iCol = Column_Num_To_Start
iTable = 0
'Loop Through Each Table and Download it to Excel in Proper Format
For Each Tab1 In HTML_Content.getElementsByTagName("table")
With HTML_Content.getElementsByTagName("table")(iTable)
For Each Tr In .Rows
For Each Td In Tr.Cells
Worksheets("ABC").Cells(iRow, iCol).Select
Worksheets("ABC").Cells(iRow, iCol) = Td.innerText
iCol = iCol + 1
Next Td
iCol = Column_Num_To_Start
iRow = iRow + 1
Next Tr
End With
iTable = iTable + 1
iCol = Column_Num_To_Start
iRow = iRow + 1
Next Tab1
MsgBox "Process Completed"
End Sub
Kindly guide me for how to fetch this data properly

Exiting Word Table after pasting from Excel

I'm trying to copy information from an excel sheet to a new word document. Currently everything copies correctly on the first loop, but pastes into the previously pasted table in the next loop. I've tried every variation of ways to exit the table I can find through searching and none seem to fix the issue. Hoping someone can help.
Sub createWord()
Dim objWord
Dim objDoc
Dim heading As New DataObject
Dim fileName As String
Dim tableRange As Word.Range
Set objWord = CreateObject("Word.Application")
Set objDoc = objWord.Documents.Add()
fileName = ActiveWorkbook.Name
fileName = Left$(fileName, InStrRev(fileName, ".") - 1) & " Data.doc"
'objDoc.SaveAs fileName:=ThisWorkbook.Path & "\" & fileName
objWord.Visible = True
For i = 4 To Application.Sheets.Count
Dim k As Integer
k = ((i - 4) * 4) + 1
heading.SetText ThisWorkbook.Worksheets(i).Cells(1, 4).Value
heading.PutInClipboard
objDoc.Paragraphs.Add
objDoc.Paragraphs(k).Range.Paste
k = k + 1
Call copyGraphAuto(i)
objDoc.Paragraphs.Add
objDoc.Paragraphs(k).Range.Paste
k = k + 1
heading.SetText ThisWorkbook.Worksheets(i).Cells(24, 5).Value
heading.PutInClipboard
objDoc.Paragraphs.Add
objDoc.Paragraphs(k).Range.Paste
k = k + 1
Call copyTableAuto(i)
objDoc.Paragraphs.Add
objDoc.Paragraphs(k).Range.Paste
Set tableRange = objDoc.Tables(k - 3).Range
tableRange.Collapse Direction:=wdCollapseEnd
'Exit For
Next i
End Sub
Sub copyTableAuto(Optional ByVal sheetNumber As Integer)
Dim ppmCount As Integer
If sheetNumber = 0 Then sheetNumber = ThisWorkbook.ActiveSheet.Index
ppmCount = Worksheets(sheetNumber).Range("M4:M9").Cells.SpecialCells(xlCellTypeConstants).Count
Application.ScreenUpdating = False
Application.DisplayAlerts = False
Worksheets(sheetNumber).Range("E29:E" & CStr(ppmCount + 28)).Merge
Worksheets(sheetNumber).Range("E25:I" & CStr(ppmCount + 28)).Copy
End Sub
Thanks
The issue is caused by your attempt to maintain an index of the paragraphs in the document.
As you are adding data to the document consecutively it would be better, and simpler, to use Word's own index and work with:
objDoc.Paragraphs.Last.Range

Export Web Text to excel vbs

Hi all i have been searching the web like mad and came up with the following vba code in excel the problem is it exports the table but not the text within the table it seems to only recover all text in the table that you can click on but not any set text.
If it will be easier for anyone to assist with their own code to do what i need it to help will be highly appreciated.
Sub my_Procedure()
Dim htm As Object
Dim Tr As Object
Dim Td As Object
Dim Tab1 As Object
Dim http As Object, html As New HTMLDocument
Dim paras As Object, para As Object, i As Long
Set http = CreateObject("MSXML2.XMLHTTP")
Set http = CreateObject("MSXML2.XMLHTTP")
http.Open "GET", "https://www.plus500.co.uk/?id=113082&tags=g_sr%2B1711614451_cpi%2BUKSearchBrand_cp%2B70887725030_agi%2BPlus500Core_agn%2Bplus%20500_ks%2Bkwd-842162906_tid%2Be_mt%2Bc_de%2Bg_nt%2B_ext%2B1006989_loc%2BUURL&%D7%90&gclid=CjwKCAjw1cX0BRBmEiwAy9tKHqylty6Mz9TbIA5VzgOiqxOcWg7biR652Hg9tksIR97hlUuAHLZilhoCTq0QAvD_BwE", False
http.send
html.body.innerHTML = http.responseText
Set paras = html.getElementsByTagName("Tbody")
i = 1
For Each para In paras
ThisWorkbook.Worksheets("Sheet3").Cells(i, 1).Value = para.innerText
i = i + 1
Next
Dim Doc As HTMLDocument
'Replace the URL of the webpage that you want to download
Web_URL = VBA.Trim(Sheets(1).Cells(1, 1))
'Create HTMLFile Object
Set HTML_Content = CreateObject("htmlfile")
Dim tdd As String
'Get the WebPage Content to HTMLFile Object
With CreateObject("msxml2.xmlhttp")
.Open "GET", Web_URL, False
.send
HTML_Content.body.innerHTML = http.responseText
End With
Column_Num_To_Start = 1
iRow = 2
iCol = Column_Num_To_Start
iTable = 0
'Loop Through Each Table and Download it to Excel in Proper Format
For Each Tab1 In HTML_Content.getElementsByTagName("tbody")
With HTML_Content.getElementsByTagName("tbody")(iTable)
For Each Tr In .Rows
For Each Td In Tr.Cells
Sheets(1).Cells(iRow, iCol).Select
Sheets(1).Cells(iRow, iCol) = Td.innerText
iCol = iCol + 1
Next Td
iCol = Column_Num_To_Start
iRow = iRow + 1
Next Tr
End With
iTable = iTable + 1
iCol = Column_Num_To_Start
iRow = iRow + 1
Next Tab1
MsgBox "Process Completed"
Call StartTimer
End Sub

Exporting pdf to excel

In Adobe DC Pro, you have the option to export your pdf to excel 2003 spreadsheet by first going to "Export PDF", next by choosing "XML Spreadsheet 2003" and last by choosing the setting "Create Worksheet for each page."
I can't find any code that does this in excel vba.
My question is, how do I make excel complete this option through VBA? Just saving it as SaveAS FileFormat:=XlFileFormat.xlXMLSpreadsheet only makes it a xml spreadsheet, not "creating the worksheet for each page" that Adobe Pro makes.
Thanks
You can do it using the Adobe Acrobat 10.0 Type Library reference, that comes with Adobe Acrobat Pro.
Here is an example:
This code will open a PDF file and assign each PDF page to individual worksheets.
Option Explicit
Sub PDF_To_Excel()
Dim PDF_ As Acrobat.AcroPDDoc
Dim Hilight_Text As Acrobat.AcroHiliteList
Dim PDF_Page As Acrobat.AcroPDPage
Dim Page_Text As Acrobat.AcroPDTextSelect
Dim ws As Worksheet
Dim Count_Page As Long
Dim i As Long, j As Long, k As Long
Dim PDF_Text_Str As String
Dim Hold_Txt As Variant
Set PDF_ = New Acrobat.AcroPDDoc
Set Hilight_Text = New Acrobat.AcroHiliteList
Hilight_Text.Add 0, 32767
With PDF_
.Open "C:\ED5049PX2.pdf"
Count_Page = .GetNumPages
For i = 1 To Count_Page
PDF_Text_Str = ""
Set PDF_Page = .AcquirePage(i - 1)
Set Page_Text = PDF_Page.CreateWordHilite(Hilight_Text)
If Not Page_Text Is Nothing Then
With Page_Text
For j = 0 To .GetNumText - 1
PDF_Text_Str = PDF_Text_Str & .GetText(j)
Next j
End With
End If
Set ws = Worksheets.Add(, Worksheets(Sheets.Count))
With ws
.Name = "Page-" & i
If PDF_Text_Str <> "" Then
Hold_Txt = Split(PDF_Text_Str, vbCrLf)
For k = 0 To UBound(Hold_Txt)
PDF_Text_Str = CStr(Hold_Txt(k))
If Left(PDF_Text_Str, 1) = "=" Then PDF_Text_Str = "'" & PDF_Text_Str
.Cells(k + 1, 1).Value = PDF_Text_Str
Next k
Else
.Cells(1, 1).Value = "No text found in page " & i
End If
.Cells.Select
.Cells.EntireColumn.AutoFit
End With
Next
.Close
End With
MsgBox ("Done")
End Sub
I'm still have a problem with Page_Text is Nothing. Even though I have a pdf open, it seems like it doesn't read the Page_Text. Here is the code:
.Open "C:\User\test.pdf"
Count_Page=.GetNumPages
For I=1 to Count_Page
PDF_Text_Str=""
Set PDF_Page=.AcquirePage(i-1)
Set Page_Text=PDF_Page.CreateWordHilite(Hilight_Text)
If Not Page_Text is Nothing then
With Page_Text
For j=0 to .GetNumText -1
PDF_Text_Str=PDF_Text_Str & .GetText(j)
Next j
End With
End if
Thanks

Code not pulling data after first webpage into excel

I have the below code that will pull through data from a table on the first page of a website (Price, name, currency, change etc)
Public Sub GetTeamData()
Dim strWebAddress As String
Dim strH2AnchorContent As String
Dim IEDocument As MSHTML.HTMLDocument
Dim objH2 As MSHTML.HTMLHeaderElement
Dim obTable As MSHTML.HTMLTable
Dim objRow As MSHTML.HTMLTableRow
Dim objCell As MSHTML.HTMLTableCell
Dim lngRow As Long
Dim lngColumn As Long
' initialize some variables that should probably better be passed as paramaters or defined as constants
strWebAddress = "https://toolkit.financialexpress.net/santanderam"
dateNow = Now
bExitLoop = False
lngTimeoutInSeconds = 5
Do While Not bExitLoop
If Now > DateAdd("s", lngTimeoutInSeconds, dateNow) Then Exit Do
Loop
' open page
Set IEDocument = GetIEDocument(strWebAddress)
If IEDocument Is Nothing Then
MsgBox "Timeout reached opening this address:" & vbNewLine & strWebAddress, vbCritical
Exit Sub
End If
Dim ButtonData As Variant
Set ButtonData = IEDocument.getElementsByClassName("paginator fe-paging-navContainer")
Dim button As MSHTML.HTMLLinkElement
For Each button In ButtonData
Debug.Print button.nodeName
button.Click
' retrieve anchor element
Set oTable = IEDocument.getElementById("Price_1_1")
Debug.Print oTable.innerText
' iterate over the table and output its contents
lngRow = 1
For Each objRow In oTable.Rows
lngColumn = 1
For Each objCell In objRow.Cells
Cells(lngRow, lngColumn) = objCell.innerText
lngColumn = lngColumn + 1
Next objCell
lngRow = lngRow + 1
Next
Next button
End Sub
My problem is that I cannot get the data to pull through from the next pages (1..7).
Can anyone please help with why the above wont pull data through from the next pages?
Thank you!
After the bit of code that opens the page, replace the rest of the code with the below code. You might have to tweak it a bit but it should go through all available pages:
' Set the object for 'Next' button
Dim oNext As Variant
Set oNext = IEDocument.getElementsByClassName("ui-paging-button ui-state-default ui-corner-all ui-paging-next")
' Loop to go through all pages
Dim bExitMLoop As Boolean: bExitMLoop = False
lngRow = 1
Do While Not bExitMLoop
' Get data from current page
Set oTable = IEDocument.getElementById("Price_1_1")
For Each objRow In oTable.Rows
lngColumn = 1
For Each objCell In objRow.Cells
Cells(lngRow, lngColumn) = objCell.innerText
lngColumn = lngColumn + 1
Next objCell
lngRow = lngRow + 1
Next
' Check if Next button is available
If oNext.Length = 0 Then
bExitMLoop = True
Else
oNext.Item(0).Click
' Wait for page to refresh (could check the ready state here as well)
dateNow = Now
bExitLoop = False
lngTimeoutInSeconds = 3
Do While Not bExitLoop
If Now > DateAdd("s", lngTimeoutInSeconds, dateNow) Then Exit Do
Loop
' Reset 'Next' button object
Set oNext = Nothing
Set oNext = IEDocument.getElementsByClassName("ui-paging-button ui-state-default ui-corner-all ui-paging-next")
End If
Loop

Resources