I'm trying to scrape multi-page Amazon reviews. My code is not capturing any of the parts I want to get.
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://www.amazon.fr/AmazonBasics-600-sacs-d%C3%A9jections-canines-distributeur/product-reviews/B00NABTG60/ref=cm_cr_getr_d_paging_btm_next_"

amazon_reviews = []
for page in range(2, 5):
    req = requests.get(url + str(page) + "?ie=UTF8&reviewerType=all_reviews&pageNumber=" + str(page))
    soup = BeautifulSoup(req.text, "html.parser")

    # Getting desired data from our parsed soup
    reviews = soup.find_all('div', {'data-hook': 'review'})
    for item in reviews:
        client = item.find('a', {'data-hook': 'genome-widget'}).text.strip()
        title = item.find('a', {'data-hook': 'review-title'}).text.strip()
        date = item.find('span', {'data-hook': 'review-date'}).text.strip()
        rating = item.find('i', {'data-hook': 'review-star-rating'}).text.replace('out of 5 stars', '').strip()
        text = item.find('span', {'data-hook': 'review-body'}).text.strip()
        amazon_reviews.append(pd.DataFrame({'title': title, 'date': date, 'text': text, 'rating': rating, 'client': client}, index=[0]))

out = pd.concat(amazon_reviews, ignore_index=True)
My output:
ValueError: No objects to concatenate
You have to inject a user-agent as the headers parameter.
Don't build a one-row DataFrame inside the for loop; collect plain dicts and build the DataFrame once at the end.
The client element selection was a bit wrong.
I've injected the pagination into the URL using str.format.
Code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://www.amazon.fr/AmazonBasics-600-sacs-d%C3%A9jections-canines-distributeur/product-reviews/B00NABTG60/ref=cm_cr_arp_d_paging_btm_next_2?pageNumber={page}"
headers = {'user-agent': 'Mozilla/5.0'}

amazon_reviews = []
for page in range(1, 5):
    req = requests.get(url.format(page=page), headers=headers)
    soup = BeautifulSoup(req.text, "html.parser")

    # Getting desired data from our parsed soup
    reviews = soup.find_all('div', {'data-hook': 'review'})
    for item in reviews:
        client = item.find('div', {'class': 'a-profile-content'}).get_text(strip=True)
        #print(client)
        title = item.find('a', {'class': 'review-title'}).text.strip()
        #print(title)
        date = item.find('span', {'data-hook': 'review-date'}).text.strip()
        #print(date)
        rating = item.find('i', {'data-hook': 'review-star-rating'}).text.replace('out of 5 stars', '').strip()
        #print(rating)
        text = item.find('span', {'data-hook': 'review-body'}).text.strip()
        #print(text)
        amazon_reviews.append({'title': title, 'date': date, 'text': text, 'rating': rating, 'client': client})

df = pd.DataFrame(amazon_reviews)
print(df)
Output:
title ... client
0 Parfaits ... Client d'Amazon
1 Tellement pratique ... Karen M
2 Génial ... Constance Jourdain
3 Bon ... Bernardini
4 Très bon produit ... Floriane Le Loher
5 Produit simple et facile d'utilisation ... K4rm4_Ez
6 La solidité ... thierry
7 Sacs à dejection + dévidoir ... M&N ABK
8 Bon produit ... Christophe FRANCOIS
9 Bonne qualité ... Neuray Gabriel
10 Très bien pour déjection canine ... PELEGRIN ERIC
11 Bonne idée ... Marine
12 Sac de qualité ... Jennifer A
13 conforme et pratique et solide ... G pay
14 Génial. ... Alban
15 Impeccable ... Marina C.
16 Pratique aux bonnes dimensions ... YVES CALVEZ
17 Solide et taille ok pour un labrador ... magnésium
18 Très pratique ... Client d'Amazon
19 très bon article ... berger fabienne
20 pratique ... Laetitia Hermann
21 Indispensable ... ronin
22 Pratique ... SylM
23 Top ... Emilie Ouviere
24 Bonne qualité ... Manon
25 Parfait ... Nicolas
26 Top ... Simon
27 Crochet énervant ! ... Jabousan
28 TOUJOURS LE MEILLEUR ... FRANKL FAN
29 Très bon produit ... Ludo96ci
30 Top pour le prix ! ... AlanLB
31 Très bien ! ... Client d'Amazon
32 Solide ... Lambourg
33 Sacs solides mais très difficiles à détacher l... ... Client d'Amazon
34 Bon rapport qualité prix ... GUYET
35 Top ... Client d'Amazon
36 Livraison rapide ... Yann
37 Il fait le job ... Rod
38 Bon produit ... Anais D
39 Pratique ... mario D.
[40 rows x 5 columns]
I've been working on a script that a co-worker of mine made. I fixed some of his issues, but I cannot figure out why it only actually works when I run it in debug mode in VS Code: when I run it from a normal Python shell, it does not produce the output files that it does in debug mode. Does anyone know why? (Some links and sensitive company data have been removed.)
Here is the code:
import requests
from requests.auth import HTTPBasicAuth
import json
import csv
import os
import pandas as pd
import datetime
import urllib3
from datetime import datetime, timedelta

#______________________________________________________________________________________
# main function
def Main():
    # HTTP request with the API account to the Rapid7 export list
    urllib3.disable_warnings()  # ignore the warnings about the self-signed certificates
    url = "URL REMOVED"
    r= requests.get(url,verify=False, auth=HTTPBasicAuth('REMOVED', 'REMOVED))
    # save the data from the HTTP request in CSV format
    with open('downloaded.csv', 'wb') as csv_file:
        csv_file.write(r.content)

    # open the input file from AD
    Filenameslist = "C:\Robert-Code\ComputerListForRapid7.json"  # full path to the file added
    with open(Filenameslist) as f:
        data = json.load(f)

    # convert the JSON to a CSV file
    with open("computerlist.csv", "w") as f:
        fieldnames = data[3].keys()
        # take the keys from row 3, because sometimes the first row of the source file is empty
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    # I use the pandas module to keep only the "Name" column of the Rapid7 file that I need.
    # Then I swap name and address so that the name is in the first column. For this I read the CSV into a dataframe.
    dfR7 = pd.read_csv("downloaded.csv")
    titles = list(dfR7.columns)
    titles[0], titles[1] = titles[1], titles[0]  # swap, so that Name is in the first column
    dfR7 = dfR7[titles]  # put the columns and data in the right order in the object
    dfR7.sort_values(["Name"], inplace=True)
    dfR7.drop(columns=["Address","Operating System","Site","Exploits","Malware","Vulnerabilities","Risk","Last Scan","Assessed"], inplace=True)
    dfR7["Name"] = dfR7["Name"].str.split('.').str[0]  # strip the domain from the FQDN
    dfR7["Name"] = dfR7["Name"].str.lower()  # everything lowercase

    # pandas again, to keep just one "Computer" column of the AD file in the object dfAD so I can compare later
    dfAD = pd.read_csv("computerlist.csv")
    dfAD.drop(columns=["DNSHostName","OperatingSystem","IPAddress", "LastLogon"], inplace=True)
    dfAD["Computer"] = dfAD["Computer"].str.lower()

    # save both objects to a CSV file in order to compare them
    dfR7.to_csv("fr7.csv", index=False)
    dfAD.to_csv("fAD.csv", index=False)
    with open('fr7.csv', 'r') as t1, open('fAD.csv', 'r') as t2:
        fileRapid = t1.readlines()
        fileAD = t2.readlines()

    # compare the files fr7.csv and fAD.csv with a for loop
    # and save the difference in update.csv
    with open('update.csv', 'w') as outFile:
        for line in fileAD:
            if line not in fileRapid:
                outFile.write(line)

    # here I bring in the original AD file again, to merge it in a moment with the freshly created update.csv
    # so that I get all the useful columns back
    dfAD = pd.read_csv("computerlist.csv")
    dfAD["Computer"] = dfAD["Computer"].str.lower()
    dfAD.to_csv("f1AD.csv", index=False)

    # merge function of the pandas module
    data1 = pd.read_csv('update.csv')
    data2 = pd.read_csv("f1AD.csv")
    output1 = pd.merge(data1, data2,
                       on='Computer',
                       how='inner')
    # save to totaldifferenceAD_R7.csv
    output1.to_csv("totaldifferenceAD_R7.csv", index=False)

    # with the datetime module I create a variable `time`: today minus 60 days
    time = datetime.today() - timedelta(60)

    # read the file in twice
    dfgood = pd.read_csv("totaldifferenceAD_R7.csv")
    dfbad = pd.read_csv("totaldifferenceAD_R7.csv")

    # this output file lists the assets whose LastLogon is more recent than 60 days ago
    dfgood['LastLogon'] = pd.to_datetime(dfgood['LastLogon'], errors='coerce')  # errors='coerce' makes invalid values in the LastLogon column be ignored
    dfgood.sort_values(["LastLogon"], inplace=True)
    dfnew = (dfgood['LastLogon'] >= time)
    dfnew = dfgood.loc[dfnew]

    # this output file lists the assets whose LastLogon is older than 60 days ago
    dfbad['LastLogon'] = pd.to_datetime(dfbad['LastLogon'], errors='coerce')  # errors='coerce' makes invalid values in the LastLogon column be ignored
    dfbad.sort_values(["LastLogon"], inplace=True)
    newdf2 = (dfbad['LastLogon'] < time)
    newdf2 = dfbad.loc[newdf2]

    # write out the final files
    dfnew.to_csv("newer_than_60_days.csv", index=False)
    newdf2.to_csv("older_than_60_days.csv", index=False)

    # clean up the intermediate files
    os.remove("fAD.csv")
    os.remove("fr7.csv")
    os.remove("computerlist.csv")
    os.remove("downloaded.csv")
    os.remove("f1AD.csv")
    os.remove("update.csv")

if __name__ == "__main__":
    Main()
Thanks in advance for any help
Because I don't have a high enough SO reputation, unfortunately I can't simply comment this and need to make it an 'Answer'.
Changing
r= requests.get(url,verify=False, auth=HTTPBasicAuth('REMOVED', 'REMOVED))
to
r= requests.get(url,verify=False, auth=HTTPBasicAuth('REMOVED', 'REMOVED'))
will get the syntax highlighting all fixed up and may make it easier for someone smarter than me to assist you :)
Something that I've previously come across (primarily with web scraping packages) is functions that don't play nicely with relative paths; perhaps changing them to absolute paths using os.path.abspath(".....") may help? It's a stab in the dark so that this 'Answer' actually has a potentially useful element to it, but it may be an adjustment worth exploring.
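To make the absolute-path idea concrete, here is a minimal sketch (only an illustration; downloaded.csv stands in for any of the files the script reads or writes, and in_script_dir is a hypothetical helper, not something from your script):

import os

# Anchor file names to the directory the script lives in, so the result no
# longer depends on the current working directory (which is often what differs
# between the VS Code debugger and a plain Python shell).
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

def in_script_dir(filename):
    """Return an absolute path for a file sitting next to this script."""
    return os.path.join(BASE_DIR, filename)

# e.g. instead of open('downloaded.csv', 'wb'):
# with open(in_script_dir('downloaded.csv'), 'wb') as csv_file:
#     csv_file.write(r.content)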
I'm trying to scrape some data for training, but I'm stuck.
I would like to scrape the full date, not just the year, but I couldn't quite figure out how to do it so far.
Here's the segment I would like to scrape:
(screenshot of the HTML segment omitted)
And here's my script so far:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

url = "https://www.senscritique.com/films/tops/top111"
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")

titles = []
years = []
notes = []
synopsys = []
infos = []
dates = []

movie_div = soup.find_all('div', class_ = 'elto-flexible-column')

for container in movie_div:
    title = container.h2.a.text
    titles.append(title)

    year = container.h2.find('span', class_ = 'elco-date').text
    year = year.replace('(', '')
    year = year.replace(')', '')
    years.append(year)

    sy = container.find('p', class_ = 'elco-description').text
    synopsys.append(sy)

    note = float(container.div.a.text)
    notes.append(note)

    info = container.find('p', class_ = 'elco-baseline elco-options').text
    #type = re.sub(r'[a-z]+', '', type)
    infos.append(info)

    soup = container.find('p', class_ = 'elco-baseline elco-options')
    for i in soup:
        i = soup.find('time')
        dates.append(i)

print(dates[0])
And here are the results:
(screenshot of the output omitted)
I would just like to have "1957-04-10" or "10 avril 1957", whichever! But I cannot figure it out! I've tried many things, and this is the best I have so far.
Thanks :)
You can use the .text property of the <time> tag to get the date:
import requests
from bs4 import BeautifulSoup

url = 'https://www.senscritique.com/films/tops/top111'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

for movie in soup.select('.elto-item'):
    title = movie.select_one('[id^="product-title"]').text
    time = movie.select_one('time')
    time = time.text if time else '-'
    print('{:<40} {}'.format(title, time))
Prints:
12 hommes en colère 10 avril 1957
Harakiri 16 septembre 1962
Barberousse 3 avril 1965
Le Bon, la Brute et le Truand 23 décembre 1966
Les Sept Samouraïs 26 avril 1954
Il était une fois dans l'Ouest 21 décembre 1968
Il était une fois en Amérique 23 mai 1984
Le Parrain 24 mars 1972
Le Trou 18 mars 1960
Dersou Ouzala 2 août 1975
Point limite 7 octobre 1964
Entre le ciel et l'enfer 1 mars 1963
...and so on.
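If you would rather have the ISO form like "1957-04-10" than the French text, the <time> element may also carry it in a datetime attribute. That is an assumption about the page's markup, so keep a fallback in case the attribute isn't there; inside the same loop as above, something like:

    time_tag = movie.select_one('time')
    # assumed: the datetime attribute holds the ISO date, e.g. "1957-04-10";
    # .get() returns None when the attribute is missing
    iso_date = time_tag.get('datetime') if time_tag else '-'
    print(iso_date)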
I think something like this would do it for you, just returning the date.
# soup is the BeautifulSoup object already created in your script
tags = soup('time')
date_formatted = list()
for tag in tags:
    date_formatted.append(tag.contents[0])
print(date_formatted[0])
import requests
from bs4 import BeautifulSoup
URL = 'https://www.mohfw.gov.in/'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
table_body = table.find_all('tbody')
print(table_body)
I want the tbody that is outside of the comment. Every layer of tr and td has a span section, and there are many layers of these.
Some of the tbody content that you wish to grab from that page is generated dynamically, but if you look in dev tools you can find a link that serves the same data as JSON. The data should all be there.
Try this:
import requests

URL = 'https://www.mohfw.gov.in/data/datanew.json'
page = requests.get(URL, headers={"x-requested-with": "XMLHttpRequest"})
for item in page.json():
    sno = item['sno']
    state_name = item['state_name']
    active = item['active']
    positive = item['positive']
    cured = item['cured']
    death = item['death']
    new_active = item['new_active']
    new_positive = item['new_positive']
    new_cured = item['new_cured']
    new_death = item['new_death']
    state_code = item['state_code']
    print(sno, state_name, active, positive, cured, death, new_active, new_positive, new_cured, new_death, state_code)
The output looks like:
2 Andaman and Nicobar Islands 677 2945 2231 37 635 2985 2309 41 35
1 Andhra Pradesh 89932 371639 278247 3460 92208 382469 286720 3541 28
3 Arunachal Pradesh 899 3412 2508 5 987 3555 2563 5 12
4 Assam 19518 94592 74814 260 19535 96771 76962 274 18
5 Bihar 19716 124536 104301 519 19823 126714 106361 530 10
6 Chandigarh 1456 3209 1713 40 1539 3376 1796 41 04
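If you would rather work with the whole payload as a table, the same JSON endpoint can be loaded straight into a pandas DataFrame; a small sketch reusing the URL and header from above (the endpoint is assumed to keep returning a list of records):

import requests
import pandas as pd

URL = 'https://www.mohfw.gov.in/data/datanew.json'
# a list of dicts converts directly into rows and columns
page = requests.get(URL, headers={"x-requested-with": "XMLHttpRequest"})
df = pd.DataFrame(page.json())
print(df.head())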
I'm trying to scrape data into a CSV file from a website that lists contact information for people in my industry. My code works well until I get to a page where one of the entries doesn't have a specific item.
So for example:
I'm trying to collect
Name, Phone, Profile URL
If there isn't a phone number listed, there isn't even a tag for that field on the page, and my code errors out with
"IndexError: list index out of range"
I'm pretty new to this, but what I've managed to cobble together so far from various YouTube tutorials and this site has saved me a ton of time on tasks that would otherwise take me days. I'd appreciate any help that anyone is willing to offer.
I've tried various if/else approaches along the lines of: if the variable is null, set the variable to "Empty".
Edit:
I updated the code. I switched to CSS selectors for more specificity and readability. I also added a try/except to at least bypass the index error, but it doesn't solve the problem of incorrect data being stored due to uneven amounts of data for each field. Also, the site I'm trying to scrape is in the code now.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Firefox()

MAX_PAGE_NUM = 5
MAX_PAGE_DIG = 2

with open('results.csv', 'w') as f:
    f.write("Name, Number, URL \n")

# Run through pages
for i in range(1, MAX_PAGE_NUM + 1):
    page_num = (MAX_PAGE_DIG - len(str(i))) * "0" + str(i)
    website = "https://www.realtor.com/realestateagents/lansing_mi/pg-" + page_num
    driver.get(website)

    Name = driver.find_elements_by_css_selector('div.agent-list-card-title-text.clearfix > div.agent-name.text-bold > a')
    Number = driver.find_elements_by_css_selector('div.agent-list-card-title-text.clearfix > div.agent-phone.hidden-xs.hidden-xxs')
    URL = driver.find_elements_by_css_selector('div.agent-list-card-title-text.clearfix > div.agent-name.text-bold > a')

    # Collect data from each page
    num_page_items = len(Name)
    with open('results.csv', 'a') as f:
        for i in range(num_page_items):
            try:
                f.write(Name[i].text.replace(",", ".") + "," + Number[i].text + "," + URL[i].get_attribute('href') + "\n")
                print(Name[i].text.replace(",", ".") + "," + Number[i].text + "," + URL[i].get_attribute('href') + "\n")
            except IndexError:
                f.write("Skip, Skip, Skip \n")
                print("Number Missing")
                continue

driver.close()
If any of the fields I'm trying to collect don't exist on individual listings, I just want the empty field to be filled in as "Empty" on the spreadsheet.
You could use try/except to take care of that. I also opted to use Pandas and BeautifulSoup as I'm more familiar with those.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd

driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')

MAX_PAGE_NUM = 5
MAX_PAGE_DIG = 2

results = pd.DataFrame()
# Run through pages
for i in range(1, MAX_PAGE_NUM + 1):
    page_num = (MAX_PAGE_DIG - len(str(i))) * "0" + str(i)
    website = "https://www.realtor.com/realestateagents/lansing_mi/pg-" + page_num
    driver.get(website)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    agent_cards = soup.find_all('div', {'class': 'agent-list-card clearfix'})
    for agent in agent_cards:
        try:
            Name = agent.find('div', {'itemprop': 'name'}).text.strip().split('\n')[0]
        except:
            Name = None
        try:
            Number = agent.find('div', {'itemprop': 'telephone'}).text.strip()
        except:
            Number = None
        try:
            URL = 'https://www.realtor.com/' + agent.find('a', href=True)['href']
        except:
            URL = None
        temp_df = pd.DataFrame([[Name, Number, URL]], columns=['Name', 'Number', 'URL'])
        results = results.append(temp_df, sort=True).reset_index(drop=True)

    print('Processed page: %s' % i)

driver.close()

results.to_csv('results.csv', index=False)
Output:
print (results)
Name ... URL
0 Nicole Enz ... https://www.realtor.com//realestateagents/nico...
1 Jennifer Worthington ... https://www.realtor.com//realestateagents/jenn...
2 Katherine Keener ... https://www.realtor.com//realestateagents/kath...
3 Erica Cook ... https://www.realtor.com//realestateagents/eric...
4 Jeff Thornton, Broker, Assoc Broker ... https://www.realtor.com//realestateagents/jeff...
5 Neal Sanford, Agent ... https://www.realtor.com//realestateagents/neal...
6 Sherree Zea ... https://www.realtor.com//realestateagents/sher...
7 Jennifer Cooper ... https://www.realtor.com//realestateagents/jenn...
8 Charlyn Cosgrove ... https://www.realtor.com//realestateagents/char...
9 Kathy Birchen & Chad Dutcher ... https://www.realtor.com//realestateagents/kath...
10 Nancy Petroff ... https://www.realtor.com//realestateagents/nanc...
11 The Angela Averill Team ... https://www.realtor.com//realestateagents/the-...
12 Christina Tamburino ... https://www.realtor.com//realestateagents/chri...
13 Rayce O'Connell ... https://www.realtor.com//realestateagents/rayc...
14 Stephanie Morey ... https://www.realtor.com//realestateagents/step...
15 Sean Gardner ... https://www.realtor.com//realestateagents/sean...
16 John Burg ... https://www.realtor.com//realestateagents/john...
17 Linda Ellsworth-Moore ... https://www.realtor.com//realestateagents/lind...
18 David Bueche ... https://www.realtor.com//realestateagents/davi...
19 David Ledebuhr ... https://www.realtor.com//realestateagents/davi...
20 Aaron Fox ... https://www.realtor.com//realestateagents/aaro...
21 Kristy Seibold ... https://www.realtor.com//realestateagents/kris...
22 Genia Beckman ... https://www.realtor.com//realestateagents/geni...
23 Angela Bolan ... https://www.realtor.com//realestateagents/ange...
24 Constance Benca ... https://www.realtor.com//realestateagents/cons...
25 Lisa Fata ... https://www.realtor.com//realestateagents/lisa...
26 Mike Dedman ... https://www.realtor.com//realestateagents/mike...
27 Jamie Masarik ... https://www.realtor.com//realestateagents/jami...
28 Amy Yaroch ... https://www.realtor.com//realestateagents/amy-...
29 Debbie McCarthy ... https://www.realtor.com//realestateagents/debb...
.. ... ... ...
70 Vickie Blattner ... https://www.realtor.com//realestateagents/vick...
71 Faith F Steller ... https://www.realtor.com//realestateagents/fait...
72 A. Jason Titus ... https://www.realtor.com//realestateagents/a.--...
73 Matt Bunn ... https://www.realtor.com//realestateagents/matt...
74 Joe Vitale ... https://www.realtor.com//realestateagents/joe-...
75 Reozom Real Estate ... https://www.realtor.com//realestateagents/reoz...
76 Shane Broyles ... https://www.realtor.com//realestateagents/shan...
77 Megan Doyle-Busque ... https://www.realtor.com//realestateagents/mega...
78 Linda Holmes ... https://www.realtor.com//realestateagents/lind...
79 Jeff Burke ... https://www.realtor.com//realestateagents/jeff...
80 Jim Convissor ... https://www.realtor.com//realestateagents/jim-...
81 Concetta D'Agostino ... https://www.realtor.com//realestateagents/conc...
82 Melanie McNamara ... https://www.realtor.com//realestateagents/mela...
83 Julie Adams ... https://www.realtor.com//realestateagents/juli...
84 Liz Horford ... https://www.realtor.com//realestateagents/liz-...
85 Miriam Olsen ... https://www.realtor.com//realestateagents/miri...
86 Wanda Williams ... https://www.realtor.com//realestateagents/wand...
87 Troy Seyfert ... https://www.realtor.com//realestateagents/troy...
88 Maggie Gerich ... https://www.realtor.com//realestateagents/magg...
89 Laura Farhat Bramson ... https://www.realtor.com//realestateagents/laur...
90 Peter MacIntyre ... https://www.realtor.com//realestateagents/pete...
91 Mark Jacobsen ... https://www.realtor.com//realestateagents/mark...
92 Deb Good ... https://www.realtor.com//realestateagents/deb-...
93 Mary Jane Vanderstow ... https://www.realtor.com//realestateagents/mary...
94 Ben Magsig ... https://www.realtor.com//realestateagents/ben-...
95 Brenna Chamberlain ... https://www.realtor.com//realestateagents/bren...
96 Deborah Cooper, CNS ... https://www.realtor.com//realestateagents/debo...
97 Huggler, Bashore & Brooks ... https://www.realtor.com//realestateagents/hugg...
98 Jodey Shepardson Custack ... https://www.realtor.com//realestateagents/jode...
99 Madaline Alspaugh-Young ... https://www.realtor.com//realestateagents/mada...
[100 rows x 3 columns]
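Since you specifically wanted missing fields to show up as "Empty" in the spreadsheet, you could also fill in the None values before writing the file; a minimal addition to the code above:

# turn the None values produced by the try/except blocks into "Empty"
results = results.fillna('Empty')
results.to_csv('results.csv', index=False)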
I'm faced with the following challenge: I want to get all the financial data about companies, and I wrote code that does it. Let's say the result looks like the one below:
Unnamed: 0 I Q 2017 II Q 2017 \
0 Przychody netto ze sprzedaży (tys. zł) 137 134
1 Zysk (strata) z działal. oper. (tys. zł) -423 -358
2 Zysk (strata) brutto (tys. zł) -501 -280
3 Zysk (strata) netto (tys. zł)* -399 -263
4 Amortyzacja (tys. zł) 134 110
5 EBITDA (tys. zł) -289 -248
6 Aktywa (tys. zł) 27 845 26 530
7 Kapitał własny (tys. zł)* 22 852 22 589
8 Liczba akcji (tys. szt.) 13 921,975 13 921,975
9 Zysk na akcję (zł) -0029 -0019
10 Wartość księgowa na akcję (zł) 1641 1623
11 Raport zbadany przez audytora N N
but there are 464 results like this in total.
Unfortunately, when I try to save all 464 results in one CSV file, only the last result gets saved. Not all 464 results, just one... Could you help me save them all? Below is my code.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.bankier.pl/gielda/notowania/akcje'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')

# Find the first table on the page
t = soup.find_all('table')[0]

# Read the table into a Pandas DataFrame
df = pd.read_html(str(t))[0]

# get the company names
names_of_company = df["Walor AD"].values

links_to_financial_date = []

# all links with the names of companies
links = []
for i in range(len(names_of_company)):
    new_string = 'https://www.bankier.pl/gielda/notowania/akcje/' + names_of_company[i] + '/wyniki-finansowe'
    links.append(new_string)

############################################################################

for i in links:
    url2 = f'https://www.bankier.pl/gielda/notowania/akcje/{names_of_company[0]}/wyniki-finansowe'
    page2 = requests.get(url2)
    soup = BeautifulSoup(page2.content, 'lxml')

    # Find the first table on the page
    t2 = soup.find_all('table')[0]
    df2 = pd.read_html(str(t2))[0]

    df2.to_csv('output.csv', index=False, header=None)
You've almost got it. You're just overwriting your CSV each time. Replace
df2.to_csv('output.csv', index=False, header=None)
with
with open('output.csv', 'a') as f:
    df2.to_csv(f, header=False)
in order to append to the CSV instead of overwriting it.
Also, your example doesn't work because this:
for i in links:
    url2 = f'https://www.bankier.pl/gielda/notowania/akcje/{names_of_company[0]}/wyniki-finansowe'
should be:
for i in links:
    url2 = i
When the website has no data, skip and move on to the next one:
try:
    t2 = soup.find_all('table')[0]
    df2 = pd.read_html(str(t2))[0]
    with open('output.csv', 'a') as f:
        df2.to_csv(f, header=False)
except:
    pass
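As an alternative to appending inside the loop, you could collect the tables in a list and write the CSV once at the end; a sketch built on the same variables (links, requests, BeautifulSoup, pd) as your code:

frames = []
for i in links:
    page2 = requests.get(i)
    soup = BeautifulSoup(page2.content, 'lxml')
    try:
        t2 = soup.find_all('table')[0]
        frames.append(pd.read_html(str(t2))[0])
    except IndexError:
        continue  # no table on this page, skip it

# assumes at least one page had a table; writes everything in one go
pd.concat(frames, ignore_index=True).to_csv('output.csv', index=False)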