I am trying to grab a text element from a page. To get to this element, my script clicks on two filters on the page, and I need to crawl 5,000 pages. The script works in terms of collecting the text element; however, after a certain number of pages it always fails with an "element not visible" message. I assume the page didn't load in time, since the text element is there when I check the pages where it breaks. (I already have time.sleep(3) after every click.) What can I use in my script to simply skip a page if it doesn't load in time?
def yelp_scraper(url):
    driver.get(url)
    # get total number of restaurants
    total_rest_loc = '//span[contains(text(),"Showing 1")]'
    total_rest_raw = driver.find_element_by_xpath(total_rest_loc).text
    total_rest = int(re.sub(r'Showing 1.*of\s', '', total_rest_raw))
    button1 = driver.find_element_by_xpath('//span[@class="filter-label filters-toggle js-all-filters-toggle show-tooltip"]')
    button1.click()
    time.sleep(1)
    button2 = driver.find_element_by_xpath('//span[contains(text(),"Walking (1 mi.)")]')
    button2.click()
    time.sleep(2)
    rest_num_loc = '//span[contains(text(),"Showing 1")]'
    rest_num_raw = driver.find_element_by_xpath(rest_num_loc).text
    rest_num = int(re.sub(r'Showing 1.*of\s', '', rest_num_raw))
    if total_rest == rest_num:
        button3 = driver.find_element_by_xpath('//span[contains(text(),"Biking (2 mi.)")]')
        button3.click()
        time.sleep(2)
        button4 = driver.find_element_by_xpath('//span[contains(text(),"Walking (1 mi.)")]')
        button4.click()
        time.sleep(2)
        rest_num_loc = '//span[contains(text(),"Showing 1")]'
        rest_num_raw = driver.find_element_by_xpath(rest_num_loc).text
        rest_num = int(re.sub(r'Showing 1.*of\s', '', rest_num_raw))
    return rest_num
chromedriver = "/Applications/chromedriver"  # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

chrome_options = Options()
# add headless mode
chrome_options.add_argument("--headless")
# turn off image loading
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome(chromedriver, chrome_options=chrome_options)

for url in url_list:
    yelp_data[url] = yelp_scraper(url)

json.dump(yelp_data, open('../data/yelp_json/yelp_data.json', 'w'), indent="\t")
driver.close()
EXAMPLE:
from selenium.common.exceptions import NoSuchElementException

for item in driver.find_elements_by_class_name('item'):
    try:
        model = item.find_element_by_class_name('product-model')
        price = item.find_element_by_class_name('product-display-price')
        title = item.find_element_by_class_name('product-title')
        url = item.find_element_by_class_name('js-detail-link')
        items.append({'model': model, 'price': price, 'title': title, 'url': url})
        print(model.text, price.text, title.text, url.get_attribute("href"))
        c = (model.text, price.text, title.text, url.get_attribute("href"))
        a.writerow(c)
    except NoSuchElementException:
        # here you can do what you want to do when an element is not found;
        # the loop then continues with the next item
        pass

b.close()
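For the original question of skipping a page that does not load in time, below is a minimal sketch using an explicit wait instead of a fixed sleep. It assumes the driver, url_list, yelp_data, and the "Showing 1" locator from the script above, and simply records None for any URL whose element never becomes visible within the timeout:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

for url in url_list:
    try:
        driver.get(url)
        # wait up to 10 seconds for the element to become visible instead of sleeping blindly
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, '//span[contains(text(),"Showing 1")]'))
        )
        yelp_data[url] = yelp_scraper(url)
    except TimeoutException:
        # the page did not load in time - record nothing and move on to the next URL
        yelp_data[url] = None
        continue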
Related
I want to know why my for loop is not saving a new image on every iteration of "for x in range(0,5,2):" but instead overwrites it in urllib.request.urlretrieve(str(src[i]),"Houses/house{}.jpg".format(i)).
for x in range(0, 5, 2):
    driver.get(urls[x])
    driver.find_element(By.CLASS_NAME, "landscape").click()
    # driver.find_element(By.XPATH, "/html/body/div[1]/div[10]/div[1]/div[3]/div/div[3]/a/span/img").click()
    time.sleep(10)
    html = driver.page_source
    s_redfin = BeautifulSoup(html, 'html.parser')
    photo_count_text = s_redfin.find('div', {'class': 'PagerIndex'}).text.split('<!-- -->')[0].split(' ')
    photo_count = int(photo_count_text[2])
    for num in range(0, photo_count):
        next = driver.find_element(By.CSS_SELECTOR, "div.nav:nth-child(3)").click()
        time.sleep(5)
    images = driver.find_elements(By.XPATH, "//img[contains(@class,'inline-block')]")
    src = []
    for img in images:
        src.append(img.get_attribute('src'))
    driver.find_element(By.CSS_SELECTOR, "svg.close:nth-child(3)").click()
    time.sleep(10)
    driver.find_element(By.CSS_SELECTOR, ".backButton > svg:nth-child(1)").click()
    time.sleep(10)
    # Put the photos in named folders that connect with the data
    for i in range(len(src)):
        urllib.request.urlretrieve(str(src[i]), "Houses/house{}.jpg".format(i))
    # go back to main page
    try:
        WebDriverWait(driver, 20).until(driver.find_element(By.CSS_SELECTOR, ".backButton > svg:nth-child(1)").click())
    except NoSuchElementException:
        print("exception handled")
Instead of adding a new image to the Houses folder, it overwrites the existing one, and I don't see the issue. How do I get the loop to add a new file on every iteration?
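For what it's worth, the overwriting comes from the file name: "Houses/house{}.jpg".format(i) depends only on the inner index i, which restarts at 0 for every listing, so each listing writes over the previous one's files. Below is a minimal sketch of the download step, assuming the outer listing index x is in scope; the folder layout is purely an illustration:

import os
import urllib.request

# one folder per listing and one file per photo, so nothing is overwritten
folder = "Houses/house{}".format(x)   # x is the outer listing index from the question
os.makedirs(folder, exist_ok=True)
for i in range(len(src)):
    urllib.request.urlretrieve(str(src[i]), "{}/photo{}.jpg".format(folder, i))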
The content of the floating window is not included in the page source; it only appears when the mouse hovers over the chart. How can I obtain the value in this situation? I find it tricky because there is no anchor point for me to control the movement of the mouse.
The code is as below:
def button_click(driver, button_num):
    driver.execute_script("arguments[0].click();", button_num)

def catogory_obtain_tokyo(driver):
    time_waiting_max = 20
    try:
        page_kansai = WebDriverWait(driver, time_waiting_max).until(
            EC.presence_of_element_located((By.ID, 'snippet-13'))
        )
        buttons = WebDriverWait(page_kansai, time_waiting_max).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "mv-button-base.mv-hyperlink-button"))
        )
        return buttons
    except:
        print('catogory_obtain error')
        driver.quit()
        return ''
path = r'chromedriver.exe'
tokyo_url = r'https://www.eex.com/en/market-data/power/futures#%7B%22snippetpicker%22%3A%22EEX%20Japanese%20Power%20Futures%20-%20Tokyo%22%7D'
# --- time line ---
timeline = '//*[@id="null"]/div/div[2]/div'
# ------- price trade reg ----
pane_pr = '//*[@id="null"]/div/div[1]/div[1]/div[2]'
# -------- volume trade registration ------
pane_vtr = '//*[@id="null"]/div/div[1]/div[3]/div[2]'
driver = webdriver.Chrome(path)
driver.get(tokyo_url)
btns = catogory_obtain_tokyo(driver)
button_click(driver, btns[0])
time.sleep(3)
# sep-03 btn
date = '//*[@id="symbolheader_jft"]/div/div[1]/div[2]/table/tbody/tr[1]/td[5]'
date_btn = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, date))
)
time.sleep(5)
date_btn.click()
# hit icon
icon_path = '//*[@id="baseloadwidget_jft"]/table/tbody/tr[2]/td[5]'
icon = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, icon_path))
)
time.sleep(5)
icon.click()
time.sleep(5)
# --------- click volume btn ------
vtr_path = '//*[@id="baseloadwidget_jft"]/table/tbody/tr[3]/td/div/div[2]/div[3]/div[2]'
vtr_btn = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, vtr_path))
)
time.sleep(5)
vtr_btn.click()
time.sleep(5)
tl = driver.find_element(By.XPATH, timeline)
webdriver.ActionChains(driver).move_to_element(tl).perform()
time.sleep(5)
pr = driver.find_element(By.XPATH, pane_pr)
webdriver.ActionChains(driver).move_to_element(pr).perform()
time.sleep(5)
vtr = driver.find_element(By.XPATH, pane_vtr)
webdriver.ActionChains(driver).move_to_element(vtr).perform()
time.sleep(5)
time.sleep(5)
driver.quit()
Basically, I tried the move_to_element method, but it only moves the mouse to the center of the element. Here, however, the whole inline chart is treated as one element, so how can I control the movement of the mouse within a single web element?
I hope this is the answer you are looking for.
from selenium import webdriver
import time
from selenium.webdriver.common.action_chains import ActionChains

driver = webdriver.Chrome(executable_path="path to chromedriver.exe")
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://www.eex.com/en/market-data/power/futures#%7B%22snippetpicker%22%3A%22EEX%20Japanese%20Power%20Futures%20-%20Tokyo%22%7D")
time.sleep(30)  # Manually selected the options.

blocks = driver.find_elements_by_xpath("//div[@class='mv-panes-host']/div[3]/div[2]//*[local-name()='svg']/*[name()='g'][2]//*[name()='rect']")
actions = ActionChains(driver)
for block in blocks:
    actions.move_to_element(block).perform()
    time.sleep(2)
    print(driver.find_element_by_xpath("//div[@id='null']/div/div[1]/div[3]/div[2]/div/div[contains(@class,'date')]").text)
    print(driver.find_element_by_xpath("//div[@id='null']/div/div[1]/div[3]/div[2]/div//div[contains(@class,'name')]").text)
    print(driver.find_element_by_xpath("//div[@id='null']/div/div[1]/div[3]/div[2]/div//div[contains(@class,'value')]").text)
driver.quit()
9/3/2021, 01:53:23 PM
Volume Trade Registration
840.000
9/3/2021, 01:56:26 PM
Volume Trade Registration
840.000
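If the chart really is exposed as a single element with no per-bar rect nodes to hover, another option (not part of the answer above) is to step the pointer across the element with ActionChains offsets. A minimal sketch, assuming the timeline and pane_vtr XPath variables defined in the question; the 10 px step size is chosen purely for illustration:

import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

chart = driver.find_element(By.XPATH, timeline)      # the single chart element from the question
steps = int(chart.size['width']) // 2 // 10          # walk from the centre to the right edge in 10 px steps

ActionChains(driver).move_to_element(chart).perform()   # start at the centre of the element
for _ in range(steps):
    ActionChains(driver).move_by_offset(10, 0).perform()  # nudge the pointer 10 px to the right
    time.sleep(1)
    # read whatever the hover pane shows at this pointer position
    print(driver.find_element(By.XPATH, pane_vtr).text)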
I am running a scraper that gets data from Nowgoal. It was running fine until this morning; I ran it again without any changes to the program, and it showed me the error "driver is not defined".
However, I have already defined it as follows:
options = webdriver.ChromeOptions() #initialise webdriver options
driver = webdriver.Chrome(resource_path('./drivers/chromedriver.exe'),options=options)
I am not sure what exactly the problem is; the error points to the last line of the program, where I quit the driver as follows:
driver.quit()
It happened a few times before; I closed IDLE, opened it again, and it worked. But now, no matter what I do, I get the same error.
Below is the detailed code, and the Link_output file is here to get the main URLs.
#######Main script start here - open the file where the urls has been stored-make sure Link_output is in same folder as script running######
read_workbook = load_workbook(filename="Link_output.xlsx")
read_sheet = read_workbook.active
for row in range(2, read_sheet.max_row+1):
    for column in "BDG":  # Here you can add or reduce the columns
        cell_name = "{}{}".format(column, row)
        main_urls.append(read_sheet[cell_name].value)
#we have URL ready
print('Urls we are going to scrape : ' ,main_urls)
#filter out the dictionary based on bookmakers entered in config file - we will have the bookmakers available in my_dictionary_button
wanted_bookmaker_lst = bkmaker_param.split(',')
for maker in wanted_bookmaker_lst:
    for k, v in main_dictionary_button.items():
        if k.startswith(maker):
            my_dictionary_button[k] = v
#now loop through each URL
for file_key, main_url in enumerate(main_urls):
    #start the new workbook for each new URL
    workbook = Workbook()
    #Error flag clear - first time action flag also cleared
    i = 0
    error_flag = 0
    file_key += 1
    #Main url - print here - every third value is the link
    if file_key % 3 == 0:
        print(main_url)
        # first we will enter into main_url - urls generally open with Crown tab - so we will click through each button of the bookmakers
        for bookmaker, odds_url_button in my_dictionary_button.items():
            if i == 0 and error_flag == 0:  #first time action
                #start the driver for the first time
                driver = webdriver.Chrome(resource_path('./drivers/chromedriver.exe'), options=options)
                #driver = webdriver.Chrome(executable_path = driver_path,options=options )
                try:
                    driver.get(main_url)  #Get the main url
                except TimeoutException:
                    driver.get(main_url)  #in case of timeout error - try again
                time.sleep(5)
                try:
                    driver.find_element_by_xpath(odds_url_button).click()  #click on the first bookmaker button
                    driver.switch_to.window(driver.window_handles[0])  #in case any pop up is opening due to any reason - switch to main window
                except NoSuchElementException:  #In case button is not found
                    print('Button not found')
                    lst_of_reattempt.append(driver.current_url)  #Get the current url for which we were not able to find the button
                    saved_button_for_halftime = odds_url_button  #save the button for later reattempt
                    driver.quit()
                    i += 1  #First time actions are over
                    error_flag == 1  #initialise the error count
                    continue
                i += 1
            elif error_flag == 1:  #if previous one went into error
                if odds_url_button == '//*[@id="htBtn"]/a':  #In case the error happened while clicking on half time button
                    half_time = 1
                    revised_url = get_url_from_button(saved_button_for_halftime, main_url, half_time)  # Get the revised url
                    userAgent = ua.random  #change user agent every time browser went into error
                    options.add_argument(f'user-agent={userAgent}')
                    driver = webdriver.Chrome(resource_path('./drivers/chromedriver.exe'), options=options)  #trigger driver
                    #driver = webdriver.Chrome(executable_path = driver_path,options=options )
                    try:
                        driver.get(revised_url)  #Revised URL open
                        time.sleep(5)
                    except TimeoutException:
                        driver.get(revised_url)  #In case of timeout - reattempt
                        time.sleep(5)
                    error_flag = 0  #disable error flag - so we can proceed as usual
                else:
                    revised_url = get_url_from_button(odds_url_button, main_url)
                    userAgent = ua.random
                    options.add_argument(f'user-agent={userAgent}')
                    driver = webdriver.Chrome(resource_path('./drivers/chromedriver.exe'), options=options)
                    #driver = webdriver.Chrome(executable_path = driver_path,options=options )
                    try:
                        driver.get(revised_url)
                    except TimeoutException:
                        driver.get(revised_url)
                    error_flag = 0
            else:  #In case of no error
                driver.find_element_by_xpath(odds_url_button).click()  #Click on next button
                driver.switch_to.window(driver.window_handles[0])  #in case any pop up is opening due to any reason - switch to main window
                i += 1
            time.sleep(random.randint(5, 7))  #sleep for random amount of time - to make the script robust
            htmlSource = driver.page_source  #Get the html code
            soup = bs4.BeautifulSoup(htmlSource, 'html.parser')  #parse the page
            #get the fixed data which is common and does not change for one bookmaker
            title, home_team, away_team, place, weather, tournament, m_date, m_time, data_first, data_second, data_third, final_score = get_fixed_data(soup)
            #home team ranking
            home_team_ranking = main_urls[file_key-3]
            away_team_ranking = main_urls[file_key-2]
            print('Title data :', title)
            if title != 'No Data':  #check if the data found or not
                #create the folder path
                print(m_date)
                folder_month, folder_day, folder_year = m_date.split('-')  #/
                folder_hour, folder_minute = m_time.split(':')
                #fle_name = folder_day +folder_month + folder_year
                #folder_name = folder_day +'_'+folder_month+'_' + folder_year
                #convert the time to gmt
                folder_time_string = folder_year +'-'+folder_month +'-'+folder_day +' '+ folder_hour+':'+folder_minute+':00'
                #folder name change
                folder_name = time.strftime("%d-%m-%Y", time.gmtime(time.mktime(time.strptime(folder_time_string, "%Y-%d-%m %H:%M:%S"))))
                print(bookmaker)
                #Output_file_format
                try:
                    print('Creating directory')
                    os.mkdir(os.path.join(os.getcwd()+'\\'+folder_name))
                except FileExistsError:
                    print('Directory already exist')
                inter_file_name = 'Odds_nowgoal_'+str(title.replace('v/s','vs'))+'_'+folder_name+'.xlsx'
                ola = os.path.join('\\'+folder_name, inter_file_name)
                output_file_name = os.path.join(os.getcwd()+ola)
                #sheet_title_first_table
                sheet_title = '1X2 Odds_'+bookmaker
                #add data to excel
                excel_add_table(sheet_title, data_first, title, home_team, away_team, place, weather, tournament, m_date, m_time, bookmaker, home_team_ranking, away_team_ranking, final_score)
                #sheet_title_second_table
                sheet_title = 'Handicap Odds_'+bookmaker
                #add data to excel
                excel_add_table(sheet_title, data_second, title, home_team, away_team, place, weather, tournament, m_date, m_time, bookmaker, home_team_ranking, away_team_ranking, final_score)
                #sheet_title_third_table
                sheet_title = 'Over_Under Odds_'+bookmaker
                #add data to excel
                excel_add_table(sheet_title, data_third, title, home_team, away_team, place, weather, tournament, m_date, m_time, bookmaker, home_team_ranking, away_team_ranking, final_score)
            else:
                lst_of_reattempt.append(home_team_ranking)
                lst_of_reattempt.append(away_team_ranking)
                lst_of_reattempt.append(driver.current_url)  #add the url into list of reattempt
                saved_button_for_halftime = odds_url_button  #save the button when error happens - so we can convert it into URL and later reattempt
                error_flag = 1
                driver.quit()  #Quit the driver in case of any error

driver.quit()
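A NameError like "driver is not defined" is consistent with driver only being created inside the bookmaker loop (and only when file_key % 3 == 0 and the first-time branch runs); if nothing ever reaches that assignment, the final driver.quit() refers to a name that never existed. Below is a minimal sketch of a defensive pattern, not the original program's logic; resource_path and options are taken from the question:

driver = None  # define the name up front so the final cleanup can always run

try:
    # ... the scraping loop from above goes here; it may or may not ever reach the
    # line that creates the Chrome driver
    driver = webdriver.Chrome(resource_path('./drivers/chromedriver.exe'), options=options)
finally:
    if driver is not None:
        driver.quit()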
I want to scrape data from the following website:
http://b2b.godrejinterio.com/GodrejInterio/dealer.aspx?id=29&menuid=2458&business=2
Here, the data is generated dynamically on the same page, without any change in the URL.
Only after you select an option from the 1st dropdown does the 2nd dropdown become active and allow a selection, and the same applies to the 3rd and 4th dropdown menus.
After selecting from all the dropdowns, you have to click the search button; only then is the data generated on the same page.
I need to scrape data for all possible selections in one go. Below is the code I tried, but it does not work as desired. I am using Python with BeautifulSoup and Selenium. Help me with this!
Mike67, I have used your suggestion and improved the code, but I am still unable to iterate over the options and save the data to a dataframe. Help me with this!
Code :
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome("C:/Users/Downloads/chromedriver")
rec = []
driver.get("http://b2b.godrejinterio.com/GodrejInterio/dealer.aspx?id=29&menuid=2458&business=2")
# wait=WebDriverWait(driver,10)
time.sleep(2)
s1 = Select(driver.find_element_by_id("ucInterioDealerLocatorNewRight_ddlRange"))
s1.select_by_value("Institutional Furniture")
# print(s1.options[0].text)
time.sleep(2)
# wait.until(EC.presence_of_all_element_located((By.ID,"ucInterioDealerLocatorNewRight_ddlRange")))
s22 = driver.find_element_by_id("ucInterioDealerLocatorNewRight_ddlSubRange")
s2 = Select(driver.find_element_by_id("ucInterioDealerLocatorNewRight_ddlSubRange"))
all_options1 = s22.find_elements_by_tag_name("option")
for option1 in all_options1:
    option1 = option1.get_attribute("value")
    print(option1)
    if option1 == '0':
        continue
    else:
        s2.select_by_value(option1)
        time.sleep(10)
        # wait.until(EC.presence_of_all_element_located((By.ID,"ucInterioDealerLocatorNewRight_ddlSubRange")))
        s33 = driver.find_element_by_id("ucInterioDealerLocatorNewRight_ddlState")
        s3 = Select(driver.find_element_by_id("ucInterioDealerLocatorNewRight_ddlState"))
        all_options2 = s33.find_elements_by_tag_name("option")
        for option2 in all_options2:
            option2 = option2.get_attribute("value")
            print(option2)
            s3.select_by_value(option2)
            # print(s3.options[1].text)
            time.sleep(10)
            # wait.until(EC.presence_of_all_elements_located((By.ID,"ucInterioDealerLocatorNewRight_ddlState")))
            s44 = driver.find_element_by_id("ucInterioDealerLocatorNewRight_ddlCity")
            s4 = Select(driver.find_element_by_id("ucInterioDealerLocatorNewRight_ddlCity"))
            all_options3 = s44.find_elements_by_tag_name("option")
            for option3 in all_options3:
                option3 = option3.get_attribute("value")
                print(option3)
                if option3 == '0':
                    continue
                else:
                    s4.select_by_value(option3)
                    # print(s4.options[1].text)
                    time.sleep(10)
                    # wait.until(EC.presence_of_all_elements_located((By.ID,"ucInterioDealerLocatorNewRight_ddlCity")))
                    s55 = driver.find_element_by_id("ucInterioDealerLocatorNewRight_ddlArea")
                    s5 = Select(driver.find_element_by_id("ucInterioDealerLocatorNewRight_ddlArea"))
                    all_options4 = s55.find_elements_by_tag_name("option")
                    for option4 in all_options4:
                        option4 = option4.get_attribute("value")
                        print(option4)
                        if option4 == '0':
                            continue
                        else:
                            s5.select_by_value(option4)
                            # print(s4.options[1].text)
                            time.sleep(10)
                            s6 = driver.find_element_by_id("ucInterioDealerLocatorNewRight_imgBtnSearch").click()
                            # for i in s6.find_all('div')
                            # print(type(s6))
                            # print(s4.content)
                            time.sleep(10)
                            # wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME,"dealer_search_maindiv")))
                            # r1 = driver.find_element_by_class_name("dealer_search_maindiv")
                            html = driver.page_source
                            # print(html)
                            soup = BeautifulSoup(html, 'html5lib')
                            try:
                                cl = soup.find('div', attrs={'class': 'dealer_search_maindiv'})
                                for i in range(0, 10):
                                    i = str(i)
                                    idd = f"ucInterioDealerLocatorNewRight_dlDealer_ctl0{i}_tblDealer"
                                    kwargs = {'id': idd}
                                    d1 = cl.find('table', kwargs)
                                    data = ";"
                                    d2 = d1.find('table')
                                    for d3 in d2.find_all('tr'):
                                        j = d3.find('td').text
                                        print(j)
                                        data = data + j + ';'
                                    print(data)
                                    rec.append(data)
                            except:
                                print("no record for this selection")
                                continue
            print("state done")
print("all subrange completed")
print(len(rec))
df = pd.DataFrame({'Record': rec})
driver.close()
If you call time.sleep in between each dropdown change, the page works:
driver.get("http://b2b.godrejinterio.com/GodrejInterio/dealer.aspx?id=29&menuid=2458&business=2")
time.sleep(2)
s1 = Select(driver.find_element_by_id("ucInterioDealerLocatorNewRight_ddlRange"))
s1.select_by_value("Institutional Furniture")
print(s1.options[0].text)
time.sleep(2)
s2 = Select(driver.find_element_by_id("ucInterioDealerLocatorNewRight_ddlSubRange"))
s2.select_by_value("Desking")
time.sleep(2)
s3 = Select(driver.find_element_by_id("ucInterioDealerLocatorNewRight_ddlState"))
s3.select_by_value("Delhi")
print(s3.options[0].text)
driver.find_element_by_id("ucInterioDealerLocatorNewRight_imgBtnSearch").click()
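If the fixed sleeps turn out to be flaky, a possible variation (not from the answer above) is to wait until the dependent dropdown has actually been repopulated before selecting from it. A minimal sketch follows; select_when_ready is a hypothetical helper, and the "more than one option" condition is an assumption about how the page fills the dependent lists:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait

def select_when_ready(driver, element_id, value, timeout=15):
    # wait until the dropdown contains more than the placeholder option,
    # then select the requested value (assumes the first entry is a placeholder)
    WebDriverWait(driver, timeout).until(
        lambda d: len(Select(d.find_element(By.ID, element_id)).options) > 1
    )
    Select(driver.find_element(By.ID, element_id)).select_by_value(value)

select_when_ready(driver, "ucInterioDealerLocatorNewRight_ddlSubRange", "Desking")
select_when_ready(driver, "ucInterioDealerLocatorNewRight_ddlState", "Delhi")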