I want to ask some questions.
I am using Python 3.7.6 with Selenium and a WebDriver to build a web crawler.
I wrote the crawler in Visual Studio Code, and it outputs a CSV file.
I used find_elements_by_xpath to capture some information. Here is part of my code:
from datetime import date,datetime
from selenium import webdriver  # load webdriver
from selenium.webdriver.common.keys import Keys  # load keyboard keys
from bs4 import BeautifulSoup  # load the BeautifulSoup tool
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
import numpy as np
import xlrd
import csv
import codecs
import time
data = xlrd.open_workbook('B.xlsx')
table = data.sheets()[0]
print(table)
nrows = table.nrows
ncols = table.ncols
print(ncols)
print(nrows)
for i in range(1, nrows):
    csv_post = "Post_No_" + str(i) + ".csv"
    with open(csv_post, 'a', newline='', encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['互動作者', '發表時間', '互動內容'])  # header: author, post time, content
    print_link = table.cell_value(i, 3)
    print(i)
    print(print_link)
    driver_blank = webdriver.Chrome('./chromedriver')  # use Chrome as the crawler's browser; load chromedriver
    driver_blank.get(print_link)
    time.sleep(1)
    post_page_count = len(driver_blank.find_elements_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[4]/div[2]/div[2]/select/option"))
    if post_page_count != 0:
        try_value = 1
        while try_value:
            try:
                driver_blank.find_element_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table[2]")
                print("測試顯示正常")  # the page rendered correctly
                try_value = 0
            except NoSuchElementException as e:
                print("測試顯示異常,現正刷新網頁")  # the page did not render, refreshing
                driver_blank.refresh()
                time.sleep(10)
        print("總頁數:" + str(post_page_count))  # total number of pages
        table_rows = len(driver_blank.find_elements_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table"))
        print("共有" + str(table_rows) + "個Table")  # number of tables found
        real_table_rows = table_rows + 1
        # only 1
        post_author = driver_blank.find_element_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table[1]/tbody/tr[2]/td[1]/a")
        post_content = driver_blank.find_element_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table[1]/tbody/tr[2]/td[2]/table/tbody/tr[1]/td/div")
        post_time = driver_blank.find_element_by_xpath("/html/body/form/div[5]/div/div/div[2]/div[1]/div[5]/table[1]/tbody/tr[2]/td[2]/table/tbody/tr[4]/td/div[2]/span")
        print("互動作者:" + post_author.text)  # author
        print("互動內容:")  # content
        print(post_content.text)
        print("發表時間:" + post_time.text)  # post time
        print("<<< --- >>>")
        with open(csv_post, 'a', newline='', encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([post_author.text, post_time.text, post_content.text])
Here is the forum post I am crawling: https://forumd.hkgolden.com/view.aspx?type=MB&message=7197409
I want to capture the text, the emoji, and the images in each post.
I can capture only the text; I cannot get the emoji or the images.
I don't know what to do. Can anyone help me? Thank you.
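One thing that may explain it: .text returns only the visible text, and on that forum the emoji and inline pictures are most likely rendered as <img> elements, which .text skips. Below is a rough sketch of also collecting those, reusing the post_content element located above; the assumption that emoji and images appear as <img> children has not been verified against this site.
# Sketch: gather the text plus the src/alt of any <img> inside the content cell.
# Assumes post_content is the WebElement found with the XPath above, and that
# emoji/images are <img> children of it (an assumption, not verified here).
content_parts = [post_content.text]
for img in post_content.find_elements_by_tag_name("img"):
    src = img.get_attribute("src")        # URL of the image or emoji graphic
    alt = img.get_attribute("alt") or ""  # alt text, if the forum provides one
    content_parts.append(alt if alt else src)
content_with_media = " ".join(part for part in content_parts if part)

# Alternatively, keep the raw markup of the post and post-process it later:
raw_html = post_content.get_attribute("innerHTML")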
I am trying to write a web-scraping job with AWS Lambda (Python), and I get this error when I execute it.
Error:
Message: unknown error: cannot find Chrome binary
How I am running it:
I downloaded chromedriver from the website below and zipped it together with the Python code below. Please let me know whether this approach is correct or whether I need to modify my code.
https://chromedriver.storage.googleapis.com/index.html?path=111.0.5563.19/
import concurrent.futures
import requests
from selenium import webdriver
import os
import subprocess
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import boto3
from datetime import datetime

def scrape_data():
    try:
        years = [2023]
        states = ["alnb"]
        for state in states:
            # create the S3 connection to write into the state folder
            for year in years:
                url = 'https://www.govinfo.gov/app/collection/uscourts/bankruptcy/' + state + '/' + str(year) + '/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D'
                options = webdriver.ChromeOptions()
                options.add_argument("headless")
                driver = webdriver.Chrome(executable_path='./chromedriver', chrome_options=options)
                driver.get(url)
                elements = WebDriverWait(driver, 2).until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, "panel-body"))
                )
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                bankruptcy_element = soup.findAll('div', {"class": "panel-collapse collapse in", "class": "panel-title", "class": "panel-body", "class": "panel panel-default", "class": "panel-collapse collapse in"})
                print("scraping data for state " + state.capitalize() + " for " + str(year).capitalize())
                data = []
                for i in bankruptcy_element:
                    for xmlfile in i.findAll('a', href=True):
                        if "pdf" in xmlfile['href']:
                            xmlfile['href'] = xmlfile['href'].replace(".pdf", "/mods.xml")
                            xmlfile['href'] = xmlfile['href'].replace("/pdf", "")
                            xmlfile['href'] = xmlfile['href'].replace("/pkg/", "/")
                            xmlfile['href'] = xmlfile['href'].replace("/content", "")
                            xmlfile['href'] = "https://www.govinfo.gov/metadata/granule" + xmlfile['href']
                            data.append(xmlfile['href'])
                return data
    except Exception as e:
        print(e)

def lambda_handler(event, context):
    s3 = boto3.client('s3')
    today_date = datetime.today().strftime('%Y-%m-%d')
    s3.put_object(Bucket='w-zone', Key='Banktcy/' + today_date + "/xmlfiles.txt", Body=scrape_data())
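For what it is worth, "cannot find Chrome binary" normally means the driver was found but the Chrome/Chromium browser itself is not in the deployment package, so Selenium has to be pointed at a bundled browser binary. A rough sketch is below; the /opt paths assume the binaries ship in a Lambda layer, which is an assumption rather than something stated above.
from selenium import webdriver

options = webdriver.ChromeOptions()
options.binary_location = "/opt/headless-chromium"  # assumed path to a packaged Chromium binary
options.add_argument("--headless")
options.add_argument("--no-sandbox")                # commonly needed inside the Lambda sandbox
options.add_argument("--single-process")
options.add_argument("--disable-dev-shm-usage")

# chromedriver is assumed to be packaged at /opt/chromedriver
driver = webdriver.Chrome(executable_path="/opt/chromedriver", options=options)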
from urllib.request import urlopen
from selenium import webdriver
from bs4 import BeautifulSoup as BSoup
import requests
import pandas as pd
from requests_html import HTMLSession
import time
import xlsxwriter
import re
import os
urlpage = 'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2019/07/14&Racecourse=ST&RaceNo=1'
# Setup selenium
driver = webdriver.Firefox(executable_path = 'geckodriver path')
# get web page
driver.get(urlpage)
time.sleep(10)
bs_obj = BSoup(driver.page_source, 'html.parser')
# Scrape table content
table = bs_obj.find('table', {"f_tac table_bd draggable"})
rows = table.find_all('tr')
table_content = []
for row in rows[1:]:
    cell_row = []
    for cell in row.find_all('td'):
        cell_row.append(cell.text.replace(" ", "").replace("\n\n", " ").replace("\n", ""))
    table_content.append(cell_row)

header_content = []
for cell in rows[0].find_all('td'):
    header_content.append(cell.text)
driver.close()
race_writer = pd.ExcelWriter('export path', engine='xlsxwriter')
df = pd.DataFrame(table_content, columns=header_content)
df.to_excel(race_writer, sheet_name='game1')
Hi all, I am trying to scrape the racing results from HKJC. When I run the code above, one of the following problems occurs:
No Excel file is created at all.
The DataFrame is not written to the Excel file, i.e. an empty Excel file is created.
Also, if I successfully scrape the results of game 1 and then amend the script to scrape game 2, it still gives me the results of game 1.
I would appreciate it if anyone could help.
I changed your script to the one below. The approach is to click through each of the relevant "Sha Tin" buttons (see range(1, len(shatin)-1)) and collect the race table data. Each race table is appended to a list called races. Finally, each race table is written to its own sheet in Excel (note that you no longer need BeautifulSoup).
Add these to your list of imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
Then:
urlpage = 'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2019/07/14&Racecourse=ST&RaceNo=1'
# Setup selenium
driver = webdriver.Firefox(executable_path = 'geckodriver path')
# get web page
driver.get(urlpage)
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//table[@class='f_fs12 f_fr js_racecard']")))
shatin = driver.find_elements_by_xpath("//table[@class='f_fs12 f_fr js_racecard']/tbody/tr/td")
races = []
for i in range(1, len(shatin)-1):
    # re-locate the race buttons after each navigation so the references are not stale
    shatin = driver.find_elements_by_xpath("//table[@class='f_fs12 f_fr js_racecard']/tbody/tr/td")
    #time.sleep(3)
    #WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[@class='performance']")))
    shatin[i].click()
    table = pd.read_html(driver.find_element_by_xpath("//div[@class='performance']").get_attribute('outerHTML'))[0]
    races.append(table)

with pd.ExcelWriter('races.xlsx') as writer:
    for i, race in enumerate(races):
        race.to_excel(writer, sheet_name=f'game{i+1}', index=False)

driver.quit()
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import pandas as pd
from pytube import YouTube
browser = webdriver.Chrome("C:/Users/Downloads/chromedriver_win32/chromedriver.exe")
browser.get("https://www.youtube.com/channel/UCaKt8dvEIPnEHWSbLYhzrxg/videos")
time.sleep(1)
elem = browser.find_element_by_tag_name("body")
no_of_pagedowns = 100
while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.2)
    no_of_pagedowns -= 1
html = browser.page_source
soup = BeautifulSoup(html, "lxml")
tags = soup.find_all('a')
fname = "C:/Stock_in_CFD/Output.txt"
text_file = open(fname, "w+", encoding="utf-8")
for tag in tags:
    t = tag.get('href')
    text_file.write(t)
When I run the above code, I get this error:
TypeError: write() argument must be str, not None
When I am not using Selenium, I am able to do it.
I am using Selenium because I want to scroll down the entire page before parsing it with BeautifulSoup.
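tag.get('href') returns None for any <a> tag that has no href attribute, and write() rejects None. Here is a minimal sketch of guarding against that; it is a generic fix for this TypeError, not necessarily the only change needed.
for tag in tags:
    t = tag.get('href')
    if t:                          # skip anchors without an href (get() returns None)
        text_file.write(t + "\n")  # newline added so links land on separate lines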
I have been developing a web page and I am now stuck: Python tells me the syntax is wrong, but I cannot find the error. Can someone please help me find it? Here is the database code that is giving me the error:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
from models import *
import os # File I/O
import time
import shutil
import glob
import configparser
config_parser = configparser.ConfigParser()
config_parser.read("config.ini")
pdownload_dir = os.path.abspath('./prism_downloads/')
dt = str(datetime.datetime.now())
filelist = glob.glob(download_dir + '/*.html')
dbpath = ('./db')
def db_Prism():
    database.connect()
    database.create_tables([Prism], safe=False)
    database.close()
    for root, dir, files in os.walk(pdownload_dir):
        for file in files:
            print(file)
            file_markup = ''
            with open(os.path.abspath(os.path.join(pdownload_dir, file)), 'r') as html:
                file_markup = html.read()
            if file_markup == '':
                print('ERROR: File was not read')
            print('Reading {0} into BS4'.format(file))
            soup = BeautifulSoup(file_markup, 'html.parser')
            print('File parsed')
            data = []
            table = soup.find('table')
            rows = table.find_all('tr')  # 18th row is header row
            cols = rows[0].find_all('td')
            cols = [ele.text.strip() for ele in cols]
            database.connect()
            for row in rows[0:]:
                d = row.find_all('td')
                d = [ele.text.strip() for ele in d]
                data.append([ele for ele in d if ele])  # Get rid of empty values
                Prism.create(pmt_id=(d[1]),
                             old_status=d[3],
                             new_status=(d[4]),
                             last_updated=float(d[5])
            database.close()  # line 96
Now here is the error message from my console:
C:\Users\Documents\NetBeansProjects\BudgetHome>python prism.py
File "prism.py", line 96
database.close()
^
SyntaxError: invalid syntax
C:\Users\Documents\NetBeansProjects\BudgetHome>
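For reference, the SyntaxError points at line 96 because the Prism.create(...) call just above it is never closed, and Python only notices at the next statement. A balanced version of that call, using the field names from the snippet above, would look like this:
Prism.create(pmt_id=d[1],
             old_status=d[3],
             new_status=d[4],
             last_updated=float(d[5]))  # closing parenthesis added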
I am trying to scrape a website and save the information using Python and Selenium. The scrape is simple, and only requires choosing the state and district in two dropdown menus, clicking a submit button, and reading and writing a table to a csv.
I am confident my packages are installed correctly and my program even works, but only some of the time. My guess is that without the proper Selenium driver 'waits', my program crashes because it can't find the correct css_selector. I'll post the program below, and if anyone has any suggestions on how to correctly incorporate Selenium driver 'waits', I would very much appreciate the help.
Thanks so much, and here's the program:
import time
import re
import string
import urllib.parse
import pandas
import numpy
import os
import csv
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
url = "https://myhpgas.in/myHPGas/HPGas/LocateDistributor.aspx"
driver.set_window_size(1120, 550)
driver.get(url);
time.sleep(5)
stateList = driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlState")
options = stateList.find_elements_by_tag_name("option")
optionsList = []
for option in options:
    optionsList.append(option.get_attribute("value"))
optionsList[1:len(optionsList)]  # note: this slice on its own has no effect

for optionValue in optionsList:
    select = Select(driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlState"))
    select.select_by_value(optionValue)
    districtList = driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlDistrict")
    distOptions = districtList.find_elements_by_tag_name("option")
    distOptionsList = []
    for distOption in distOptions:  # iterate over the options, place attribute value in list
        distOptionsList.append(distOption.get_attribute("value"))
    for distOptionValue in distOptionsList[1:len(distOptionsList)]:
        distSelect = Select(driver.find_element_by_css_selector("#ContentPlaceHolder1_ddlDistrict"))
        distSelect.select_by_value(distOptionValue)
        driver.find_element_by_css_selector('#ContentPlaceHolder1_btnShowList').click()
        data = []
        for tr in driver.find_elements_by_css_selector('#ContentPlaceHolder1_gvDistributor'):
            tds = tr.find_elements_by_tag_name('td')
            if tds:
                data.append([td.text for td in tds])
        print(data)
        dataRows = int(numpy.array(data).size / 7)
        rowsTimesColumns = (dataRows * 7) - 1
        newArray = numpy.array(data)
        outArray = newArray[0:rowsTimesColumns]
        test = pandas.DataFrame(outArray.reshape(dataRows, 7), columns=['no', 'distributor', 'address', 'contact1', 'contact2', 'contact3', 'map'])
        file_path = 'Users/outpath' + '_' + optionValue + '_' + distOptionValue + '.csv'
        test.to_csv(file_path, sep=',')
        driver.back()
    driver.back()
Can you tell me which line returns the error? Also, how about using XPaths?
I couldn't see any statement implementing explicit waits, for example:
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "your css selector")))
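As an illustration, here is a rough sketch of wrapping the two dropdowns and the submit button in explicit waits. It assumes it runs inside the existing loops where optionValue and distOptionValue are defined, reuses the element IDs from the script above, and picks element_to_be_clickable as one reasonable condition among several.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 30)

# Wait until the state dropdown is clickable before selecting a value.
state_el = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#ContentPlaceHolder1_ddlState")))
Select(state_el).select_by_value(optionValue)

# The district list is repopulated after the state changes, so wait again.
district_el = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#ContentPlaceHolder1_ddlDistrict")))
Select(district_el).select_by_value(distOptionValue)

# Wait for the submit button before clicking it.
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#ContentPlaceHolder1_btnShowList"))).click()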