How to write to a csv from python - python-3.x

I am web scraping pacsun.com with Python and trying to put the results into a CSV file, but when I open the file only the headers appear, not the product_name, price, or new_arrival values.
So my question is: how do I get these values to print out under the headers in a CSV file?
from bs4 import BeautifulSoup as soup
import csv
from urllib.request import urlopen as uReq

my_url = ('https://www.pacsun.com/mens/')
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')
product_data = page_soup.findAll('div',{'class':'product-data'})
#print(len(product_data))
#print(product_data[0])
product = product_data[0]

filename = 'pacsun.csv'
f = open(filename,'w')
headers = 'product_name, price, new_arrival\n'
f.write(headers)

for product in product_data:
    #name = product.div.a["title"]
    product_name = print('product: ' + product.div.a["title"])
    #the code above gets the title of the product
    price = product.findAll('div',{'class':'product-price group'})
    #the code above gets the price of the product
    new_arrival = product.findAll('div',{'class':'new'})
    #the code above gets the new arrival label
    print(price[0].text)
    print(new_arrival[0].text)
    thewriter = csv.DictWriter(filename, headers)
    thewriter.writerow({'product_name':product_name, 'price':price, 'new_arrival':new_arrival})
    #f.write(product_name.replace(",", "|") + "," + price + ","+ new_arrival + "\n")

f.close()

You have a problem with the data, so I fixed it and it works fine. You only need to change 'w' to 'a', i.e. f = open(filename,'a'), and to put the f.write inside the loop:
from bs4 import BeautifulSoup as soup
import csv
from urllib.request import urlopen as uReq

my_url = ('https://www.pacsun.com/mens/')
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')
product_data = page_soup.findAll('div',{'class':'product-data'})
#print(len(product_data))
#print(product_data[0])
product = product_data[0]

filename = 'pacsun.csv'
f = open(filename,"a")
headers = 'product_name, price, new_arrival\n'
f.write(headers)

for product in product_data:
    #name = product.div.a["title"]
    print('product: ' + product.div.a["title"])
    #the code above gets the title of the product
    price = product.findAll('div',{'class':'product-price group'})
    #the code above gets the price of the product
    new_arrival = product.findAll('div',{'class':'new'})
    price_ = ''
    new_arrival_ = ''
    product_name_ = ''
    # product_name_ = ' '.join([str(elem) for elem in product.div.a["title"]])
    for price_text in price:
        price_ = price_text.text
    for new_arrival_text in new_arrival:
        new_arrival_ = new_arrival_text.text
    f.write(product.div.a["title"] + "," + price_ + "," + new_arrival_ + "\n")

f.close()
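If you prefer csv.DictWriter, as in the original attempt, note that it needs an open file object plus a list of field names, not the filename string and the header line. Here is a minimal sketch of that approach, assuming the same product_data result from the code above; the fallback empty strings are just a guess for products with no price or "new" label:

import csv

# Sketch only: product_data is assumed to come from the BeautifulSoup code above.
with open('pacsun.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['product_name', 'price', 'new_arrival'])
    writer.writeheader()
    for product in product_data:
        price_divs = product.findAll('div', {'class': 'product-price group'})
        new_divs = product.findAll('div', {'class': 'new'})
        writer.writerow({
            'product_name': product.div.a['title'],
            'price': price_divs[0].text.strip() if price_divs else '',
            'new_arrival': new_divs[0].text.strip() if new_divs else '',
        })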

Related

Python - Problem with saving as '.csv' extension

Functioning code:
import requests
from bs4 import BeautifulSoup
import pyautogui
import csv

url = 'https://url~~'
res = requests.get(url)
html = res.text
soup = BeautifulSoup(html, 'html.parser')
total = soup.select('.gall_title')

searchList = []
contentList = []

for i in total:
    searchList.append("https://url~~" + i.attrs['href'])

for i in searchList:
    res2 = requests.get(i)
    html2 = res2.text
    soup2 = BeautifulSoup(html2, 'html.parser')
    content_h = soup2.select('h3 > span.title_subject')
    contentList.append(content_h)

print(contentList)

#save csv
f = open('1.csv', 'w', encoding='utf-8', newline='')
csvWriter = csv.writer(f)
for i in contentList:
    csvWriter.writerow(i)
f.close()
★Result★
print(contentList):
# [[<span class="title_subject">tomato</span>], [<span class="title_subject">apple</span>]]
Non-functioning code:
import requests
from bs4 import BeautifulSoup
import pyautogui
import csv

url = 'https://url~~'
res = requests.get(url)
html = res.text
soup = BeautifulSoup(html, 'html.parser')
total = soup.select('.gall_title')

searchList = []
contentList = []

for i in total:
    searchList.append("https://url~~" + i.attrs['href'])

for i in searchList:
    res2 = requests.get(i)
    html2 = res2.text
    soup2 = BeautifulSoup(html2, 'html.parser')
    content_h = str(soup2.select('h3 > span.title_subject'))  # <-- the only line changed
    contentList.append(content_h)

print(contentList)

#save csv
f = open('1.csv', 'w', encoding='utf-8', newline='')
csvWriter = csv.writer(f)
for i in contentList:
    csvWriter.writerow(i)
f.close()
★Result★
print(contentList):
['[tomato]', '[apple]']
How can I fix the issue where the strings are saved one character at a time in the '.csv' file?
soup2.select('h3 > span.title_subject') gives you the whole HTML tag. Extract its string value with .string:
strings = [x.string for x in soup2.select('h3 > span.title_subject')]
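As for the CSV side: csv.writer.writerow expects a sequence of fields, so passing it a plain string makes it treat every character as a separate field, which is why the non-functioning version writes one letter per cell. A minimal sketch of writing one extracted title per row, assuming a list like strings from the line above:

import csv

# Sketch: `strings` is assumed to hold one title per <span>, as extracted above.
with open('1.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    for title in strings:
        writer.writerow([title])  # wrap in a list so the whole string is one cell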

How can I get BeautifulSoup info in the same row?

I am currently web scraping and would like to get the specifications on the same row. When I print it now, column 2 looks like this:
text
text
text
text
text
I would like to get it all on the same row, like this:
text text text text text
so I can chop it up into different columns in Excel later.
Is there maybe a transposing command I could use, or something else?
Code:
import requests
from bs4 import BeautifulSoup
import csv

with open('Oslo.csv', 'w', newline='') as f:
    fieldnames = ['column1', 'column2']
    skriver = csv.DictWriter(f, fieldnames=fieldnames)
    skriver.writeheader()

    def data(page_number):
        URL = 'https://www.url.com/' + str(page_number) + '&sort=PUBLISHED_DESC'
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
        for data in ads:
            id = data.find('a')
            link = (id['id'])
            url = 'https://www.url.com/' + str(link)
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            ads = soup.findAll('div', class_="u-word-break")
            for stats in ads:
                address = stats.find('p', class_="u-caption")
                specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
                skriver.writerow({'column1': address.text.strip(), 'column2': specs.text})

    for x in range(1, 2):
        data(x)

print('Ferdig, du kan åpne oslo.csv')
EDIT: Scraping from the website is illegal, so I removed the URL.
Your specs.text is a string that contains \n newlines. You can split it, then join it back together with just a space, i.e. ' '.join(specs.text.split()):
import requests
from bs4 import BeautifulSoup
import csv

with open('Oslo.csv', 'w', newline='') as f:
    fieldnames = ['column1', 'column2']
    skriver = csv.DictWriter(f, fieldnames=fieldnames)
    skriver.writeheader()

    def data(page_number):
        URL = 'https://www.url.com/' + str(page_number) + '&sort=PUBLISHED_DESC'
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
        for data in ads:
            id = data.find('a')
            link = (id['id'])
            url = 'https://www.url.com/' + str(link)
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            ads = soup.findAll('div', class_="u-word-break")
            for stats in ads:
                address = stats.find('p', class_="u-caption")
                specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
                address = ' '.join(address.text.split())
                specs = ' '.join(specs.text.split())  #<-- changed here
                skriver.writerow({'column1': address, 'column2': specs})

    for x in range(1, 2):
        data(x)

print('Ferdig, du kan åpne oslo.csv')
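As a standalone illustration of the whitespace trick (not part of the answer's code), str.split() with no argument splits on any run of whitespace, including newlines, so joining with a single space flattens a multi-line value onto one row:

# Collapse any run of whitespace (including newlines) into single spaces.
raw = "text\ntext\ntext\ntext\ntext"
flat = ' '.join(raw.split())
print(flat)  # -> "text text text text text"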

With Beautiful Soup I scrape Twitter data. I am able to get the data but can't save it in a CSV file

I scraped Twitter for the user name, tweets, replies, and retweets, but can't save them in a CSV file.
Here is the code:
from urllib.request import urlopen
from bs4 import BeautifulSoup

file = "5_twitterBBC.csv"
f = open(file, "w")
Headers = "tweet_user, tweet_text, replies, retweets\n"
f.write(Headers)

for page in range(0,5):
    url = "https://twitter.com/BBCWorld".format(page)
    html = urlopen(url)
    soup = BeautifulSoup(html,"html.parser")
    tweets = soup.find_all("div", {"class":"js-stream-item"})
    for tweet in tweets:
        try:
            if tweet.find('p',{"class":'tweet-text'}):
                tweet_user = tweet.find('span',{"class":'username'}).text.strip()
                tweet_text = tweet.find('p',{"class":'tweet-text'}).text.encode('utf8').strip()
                replies = tweet.find('span',{"class":"ProfileTweet-actionCount"}).text.strip()
                retweets = tweet.find('span', {"class" : "ProfileTweet-action--retweet"}).text.strip()
                print(tweet_user, tweet_text, replies, retweets)
                f.write("{}".format(tweet_user).replace(",","|") + ",{}".format(tweet_text) + ",{}".format(replies).replace(",", " ") + ",{}".format(retweets) + "\n")
        except: AttributeError
f.close()
I get the data but can't save it in the CSV file. Can someone explain how to save the data in a CSV file?
As you can see, you've only made a small error in finding the tweets here: tweets = soup.find_all("div", {"class":"js-stream-item"}). You forgot to pass the attributes by keyword, which should look like this: tweets = soup.find_all("div", attrs={"class":"js-stream-item"}).
This is a working solution, but it only fetches the first 20 tweets:
from urllib.request import urlopen
from bs4 import BeautifulSoup

file = "5_twitterBBC.csv"
f = open(file, "w")
Headers = "tweet_user, tweet_text, replies, retweets\n"
f.write(Headers)

url = "https://twitter.com/BBCWorld"
html = urlopen(url)
soup = BeautifulSoup(html, "html.parser")

# Gets the tweets
tweets = soup.find_all("li", attrs={"class":"js-stream-item"})

# Writes the fetched tweets to the file
for tweet in tweets:
    try:
        if tweet.find('p',{"class":'tweet-text'}):
            tweet_user = tweet.find('span',{"class":'username'}).text.strip()
            tweet_text = tweet.find('p',{"class":'tweet-text'}).text.encode('utf8').strip()
            replies = tweet.find('span',{"class":"ProfileTweet-actionCount"}).text.strip()
            retweets = tweet.find('span', {"class" : "ProfileTweet-action--retweet"}).text.strip()
            # String interpolation technique
            f.write(f'{tweet_user},/^{tweet_text}$/,{replies},{retweets}\n')
    except AttributeError:
        pass
f.close()
filename = "output.csv"
f = open(filename, "w", encoding="utf-8")
headers = " tweet_user, tweet_text, replies, retweets \n"
f.write(headers)

# ***your code***
# ***loop***
f.write(tweet_user + "," + tweet_text + "," + replies + "," + retweets + "\n")
f.close()
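If commas can appear inside the tweet text, hand-built f.write lines like the ones above will break the column layout; the csv module handles the quoting for you. A rough sketch of the same write, assuming the four fields have already been extracted as plain strings inside your scraping loop (decode tweet_text first if it is bytes):

import csv

# Sketch only: swap the manual f.write for csv's quoting-aware writer.
with open('output.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['tweet_user', 'tweet_text', 'replies', 'retweets'])
    # ... inside your scraping loop, once the four fields are extracted:
    writer.writerow([tweet_user, tweet_text, replies, retweets])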

Pagination Webscraping Python3- BS4 - While loop

I finished my scraper for one page and extracted the href for the next page.
I can't get the scraper into a loop for each subsequent page. I tried a while True loop, but it kills my results from the first page.
This code works perfectly for the first page:
import bs4
from urllib.request import urlopen as ireq
from bs4 import BeautifulSoup as soup

myurl = ('https://www.podiuminfo.nl/concertagenda/')
uClient = ireq(myurl)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")

filename = "db.csv"
f = open(filename, "w")
headers = "Artist, Venue, City, Date\n"
f.write(headers)

DayContainer = page_soup.findAll("section",{"class":"overflow"})
print("Days on page: " + str(len(DayContainer)) + "\n")

def NextPage():
    np = page_soup.findAll("section", {"class":"next_news"})
    np = np[0].find('a').attrs['href']
    print(np)

for days in DayContainer:
    shows = days.findAll("span", {"class":"concert_uitverkocht"})
    for soldout in shows:
        if shows:
            soldoutPlu = shows[0].parent.parent.parent
            artist = soldoutPlu.findAll("div", {"class":"td_2"})
            artist = artist[0].text.strip()
            venue = soldoutPlu.findAll("div", {"class":"td_3"})
            venue = venue[0].text
            city = soldoutPlu.findAll("div", {"class":"td_4"})
            city = city[0].text
            date = shows[0].parent.parent.parent.parent.parent
            date = date.findAll("section", {"class":"concert_agenda_date"})
            date = date[0].text
            date = date.strip().replace("\n", " ")
            print("Datum gevonden!")
            print("Artiest: " + artist)
            print("Locatie: " + venue)
            print("Stad: " + city)
            print("Datum: " + date + "\n")
            f.write(artist + "," + date + "," + city + "," + venue + "\n")
        else:
            pass

NextPage()
No need for a baseurl + number method I suppose, because I can extract the correct url from each page using findAll. I'm fairly new so the mistake must be pretty dumb.
Thanks for helping out!
Try the script below to get the required fields while traversing the different pages, and write them to a csv file accordingly. I tried to clean up your repetitive code and applied a slightly cleaner approach in its place. Give it a go:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

link = 'https://www.podiuminfo.nl/concertagenda/?page={}&input_plaats=&input_datum=2018-06-30&input_podium=&input_genre=&input_provincie=&sort=&input_zoek='

with open("output.csv","w",newline="",encoding="utf-8") as infile:
    writer = csv.writer(infile)
    writer.writerow(['Artist','Venue','City'])
    pagenum = -1  #make sure to get the content of the first page as well, which is "0" in the link
    while True:
        pagenum += 1
        res = urlopen(link.format(pagenum)).read()
        soup = BeautifulSoup(res, "html.parser")
        container = soup.find_all("section",class_="concert_rows_info")
        if len(container) <= 1: break  ##as soon as there is no content, the scraper should break out of the loop
        for items in container:
            artist = items.find(class_="td_2")("a")[0].get_text(strip=True)
            venue = items.find(class_="td_3").get_text(strip=True)
            city = items.find(class_="td_4").get_text(strip=True)
            writer.writerow([artist,city,venue])
            print(f'{artist}\n{venue}\n{city}\n')
Your mistake:
You have to actually fetch the URL that you found. At the end of your file you are just calling NextPage(), but all it does is print out the URL.
That was your mistake :)
import bs4
from urllib.request import urlopen as ireq
from bs4 import BeautifulSoup as soup

filename = "db.csv"
#at the beginning of the document you create the file in 'w'-write mode
#but later you should open it in 'a'-append mode because 'w'-write will rewrite the file
f = open(filename, "w")
headers = "Artist, Venue, City, Date\n"
f.write(headers)
f.close()

#create a function url_fetcher that will go and fetch the html every time
def url_fetcher(url):
    myurl = (url)
    uClient = ireq(myurl)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    DayContainer = page_soup.findAll("section",{"class":"overflow"})
    print("Days on page: " + str(len(DayContainer)) + "\n")
    get_artist(DayContainer, page_soup)

#here you have to call the url, otherwise it won't work
def NextPage(page_soup):
    np = page_soup.findAll("section", {"class":"next_news"})
    np = np[0].find('a').attrs['href']
    url_fetcher(np)

#in get_artist you have some repetition, but you can tweak it a little bit and it will work
def get_artist(DayContainer, page_soup):
    for days in DayContainer:
        shows = days.findAll("span", {"class":"concert_uitverkocht"})
        for soldout in shows:
            print(soldout)
            if shows:
                soldoutPlu = shows[0].parent.parent.parent
                artist = soldoutPlu.findAll("div", {"class":"td_2"})
                artist = artist[0].text.strip()
                venue = soldoutPlu.findAll("div", {"class":"td_3"})
                venue = venue[0].text
                city = soldoutPlu.findAll("div", {"class":"td_4"})
                city = city[0].text
                date = shows[0].parent.parent.parent.parent.parent
                date = date.findAll("section", {"class":"concert_agenda_date"})
                date = date[0].text
                date = date.strip().replace("\n", " ")
                print("Datum gevonden!")
                print("Artiest: " + artist)
                print("Locatie: " + venue)
                print("Stad: " + city)
                print("Datum: " + date + "\n")
                with open(filename, "a") as f:
                    f.write(artist + "," + date + "," + city + "," + venue + "\n")
            else:
                pass
    NextPage(page_soup)

url_fetcher('https://www.podiuminfo.nl/concertagenda/')
Recap
For easier understanding I've made one big loop, but it works :)
You need to make some adjustments so there are no repeated names and dates in db.csv.
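One way to drop those duplicate rows afterwards, sketched here under the assumption that db.csv was written with the header above, is to filter repeated lines while rewriting the file:

import csv

# Sketch: remove duplicate rows from db.csv, keeping the first occurrence of each.
with open('db.csv', newline='') as src:
    rows = list(csv.reader(src))

seen = set()
unique_rows = []
for row in rows:
    key = tuple(row)
    if key not in seen:
        seen.add(key)
        unique_rows.append(row)

with open('db.csv', 'w', newline='') as dst:
    csv.writer(dst).writerows(unique_rows)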

'list' object has no attribute 'timeout' and only prints first item in the table

I am trying to pull a table from a list of URLs. When I input only one URL it only prints out the first items in the table, and when I add more URLs to the list I get the error message 'list' object has no attribute 'timeout'. What is the best way to get the rest of the items and add more URLs?
Below is the code I am running.
import time, random, csv, bs4, requests, io
import pandas as pd
timeDelay = random.randrange(5, 20)
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_urls = [
    "https://www.lonza.com/products-services/bio-research/electrophoresis-of-nucleic-acids-and-proteins/nucleic-acid-electrophoresis/precast-gels-for-dna-and-rna-analysis/truband-gel-anchors.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-epithelial-cells/nucleofector-kits-for-human-mammary-epithelial-cells-hmec.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-neural-cells/nucleofector-kits-for-mammalian-glial-cells.aspx",
]

uClient = uReq(my_urls)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll('tbody')

product_name_list = []
cat_no_list = []
size_list = []
price_list = []

for container in containers:
    if (len(container) > 0):
        #try:
        title_container = container.findAll('td')
        Product_name = title_container[0].text.strip()
        product_name_list.append(Product_name)

        CatNo_container = container.findAll('td')
        CatNo = CatNo_container[1].text.strip()
        cat_no_list.append(CatNo)

        #Size_container = container.findAll('div',{'class':'col-xs-2 noPadding'})
        #Size = Size_container[0].text.strip()
        #size_list.append(Size)

        Price_container = container.findAll('td')
        Price = Price_container[4].text.strip()
        price_list.append(Price)

        print('Product_name: ' + Product_name)
        print('CatNo: ' + CatNo)
        print('Size: ' + 'N/A')
        print('Price: ' + Price)
        print(" ")

        time.sleep(timeDelay)
You are passing a list here, uClient = uReq(my_urls), where a string is required.
You need to pass the individual elements of the list, i.e. the strings.
Here is the edited code that works for multiple URLs.
UPDATED CODE (to get all items):
import time, random, csv, bs4, requests, io
import pandas as pd
timeDelay = random.randrange(5, 20)
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_urls = [
    "https://www.lonza.com/products-services/bio-research/electrophoresis-of-nucleic-acids-and-proteins/nucleic-acid-electrophoresis/precast-gels-for-dna-and-rna-analysis/truband-gel-anchors.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-epithelial-cells/nucleofector-kits-for-human-mammary-epithelial-cells-hmec.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-neural-cells/nucleofector-kits-for-mammalian-glial-cells.aspx",
]

for url in my_urls:
    print("URL using: ", url)
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll('tbody')

    product_name_list = []
    cat_no_list = []
    size_list = []
    price_list = []

    for container in containers:
        if (len(container) > 0):
            #try:
            items = container.findAll('tr')
            for item in items:
                item = item.text.split('\n')
                Product_name = item[1]
                product_name_list.append(Product_name)

                CatNo = item[2]
                cat_no_list.append(CatNo)

                #Size_container = container.findAll('div',{'class':'col-xs-2 noPadding'})
                #Size = Size_container[0].text.strip()
                #size_list.append(Size)

                Price = item[6]
                price_list.append(Price)

                print('Product_name: ' + Product_name)
                print('CatNo: ' + CatNo)
                print('Size: ' + 'N/A')
                print('Price: ' + Price)
                print(" ")

                time.sleep(timeDelay)
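The three lists are rebuilt for every URL and never written out. If you move the list initialisations above the for url loop so they accumulate across pages, one way to save them (a sketch using the already-imported pandas; 'lonza_products.csv' is just a made-up output name) is:

import pandas as pd

# Sketch: collect the scraped columns into a DataFrame and write a CSV.
df = pd.DataFrame({
    'Product_name': product_name_list,
    'CatNo': cat_no_list,
    'Price': price_list,
})
df.to_csv('lonza_products.csv', index=False)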
