I have used one of the methods described here (Python write to CSV line by line) to attempt to write all the lines of my output to a .CSV. I've managed to get it to the stage of outputting and generating the CSV, but instead of showing all the lines of my data I am seeing one line repeated 4 times and nothing else.
Can anyone see what the issue is here?
from bs4 import BeautifulSoup
import requests
import csv

headers = {'User-Agent': 'Mozilla/5.0'}

for i in range(1, 300):
    url = "xxx?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', class_='product-block__info')
    for item in items:
        product = item.find('span', class_='short_desc').text
        stock = item.find('span', class_='count_product_stock hidden').text
        brand = item.find('h4', class_='brand').text
        price = item.find('span', class_='selling_price').text
        # create a list of all the fields
        sheets = [brand, product, stock, price]
        print(sheets)

with open('csvfile.csv', 'wt') as file:
    for l in sheets:
        file.writelines(sheets)
        file.write('\n')
Opening the file in 'wt' mode truncates it, so only the last sheets list survives, and the inner for l in sheets loop then calls writelines(sheets) once per field, which is why you see the same line repeated 4 times. You probably want something more like the following untested code (the example provided can't be run as is, since the URL is a placeholder):
from bs4 import BeautifulSoup
import requests
import csv

headers = {'User-Agent': 'Mozilla/5.0'}

# Open the file once. See the csv documentation for the correct way to open
# a file for use with csv.writer. If you plan to open the .csv with
# Excel, the utf-8-sig encoding will allow non-ASCII to work correctly.
with open('csvfile.csv', 'w', encoding='utf-8-sig', newline='') as f:
    file = csv.writer(f)  # actually use the csv module.
    for i in range(1, 300):
        url = "xxx?page=%s" % i
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        items = soup.find_all('div', class_='product-block__info')
        for item in items:
            product = item.find('span', class_='short_desc').text
            stock = item.find('span', class_='count_product_stock hidden').text
            brand = item.find('h4', class_='brand').text
            price = item.find('span', class_='selling_price').text
            # create a list of all the fields
            sheets = [brand, product, stock, price]
            # write a single line.
            file.writerow(sheets)
Here's a tested example that will open in Excel. I threw in a non-ASCII character and a comma in the data to demonstrate the csv module's ability to handle them:
# coding: utf8
import csv

with open('csvfile.csv', 'w', encoding='utf-8-sig', newline='') as f:
    file = csv.writer(f)
    file.writerow('BRAND PRODUCT STOCK PRICE'.split())
    for i in range(1, 11):
        sheets = ['brand{}'.format(i), 'pröduct{}'.format(i), 'st,ock{}'.format(i), 'price{}'.format(i)]
        file.writerow(sheets)
Output:
BRAND,PRODUCT,STOCK,PRICE
brand1,pröduct1,"st,ock1",price1
brand2,pröduct2,"st,ock2",price2
brand3,pröduct3,"st,ock3",price3
brand4,pröduct4,"st,ock4",price4
brand5,pröduct5,"st,ock5",price5
brand6,pröduct6,"st,ock6",price6
brand7,pröduct7,"st,ock7",price7
brand8,pröduct8,"st,ock8",price8
brand9,pröduct9,"st,ock9",price9
brand10,pröduct10,"st,ock10",price10
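If you want to sanity-check the file without Excel, reading it back with csv.reader shows the quoted comma surviving the round trip (a quick check, not part of the answer itself):

import csv

# read the file back with the same encoding it was written with
with open('csvfile.csv', encoding='utf-8-sig', newline='') as f:
    for row in csv.reader(f):
        print(row)  # e.g. ['brand1', 'pröduct1', 'st,ock1', 'price1']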
Related
I recently started getting into web scraping and have managed OK, but now I'm stuck and can't find the answer or figure it out.
Here is my code for scraping and exporting info from a single page:
import requests
page = requests.get("https://www.example.com/page.aspx?sign=1")

from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

# finds the right heading to grab
box = soup.find('h1').text
heading = box.split()[0]

# finds the right paragraph to grab
reading = soup.find_all('p')[0].text
print(heading, reading)

import csv
from datetime import datetime

# open a csv file with append, so old data will not be erased
with open('index.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([heading, reading, datetime.now()])
The problem occurs when I try to scrape multiple pages at the same time. They are all the same; only the pagination changes, e.g.
https://www.example.com/page.aspx?sign=1
https://www.example.com/page.aspx?sign=2
https://www.example.com/page.aspx?sign=3
https://www.example.com/page.aspx?sign=4 etc.
Instead of writing the same code 20 times, how do I stick all the data in a tuple or an array and export it to CSV?
Many thanks in advance.
Just try it with a loop that keeps going until there is no page available (the request is not OK). Should be easy to get:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime

results = []
page_number = 1

while True:
    response = requests.get(f"https://www.example.com/page.aspx?sign={page_number}")
    if response.status_code != 200:
        break

    soup = BeautifulSoup(response.content, 'html.parser')

    # finds the right heading to grab
    box = soup.find('h1').text
    heading = box.split()[0]

    # finds the right paragraph to grab
    reading = soup.find_all('p')[0].text

    # append a list
    # results.append([heading, reading, datetime.now()])
    # ...or a tuple, your call
    results.append((heading, reading, datetime.now()))

    page_number = page_number + 1

with open('index.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    for result in results:
        writer.writerow(result)
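One thing to watch with 'a' (append) mode: every run adds more rows to the same file, so if you also want a header row you only want to write it the first time. A minimal sketch of that, assuming an os.path.isfile check is acceptable (the column names here are just placeholders):

import csv
import os

write_header = not os.path.isfile('index.csv')

with open('index.csv', 'a', newline='') as csv_file:
    writer = csv.writer(csv_file)
    if write_header:
        # only on the very first run, before any data rows exist
        writer.writerow(['heading', 'reading', 'scraped_at'])
    for result in results:
        writer.writerow(result)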
I have tried reading a table from a website. As you can see from my code, I have gone the long way around to get the table; I would appreciate it if someone could show me a quicker method to do the same.
Here's my code:
import urllib.request
from bs4 import BeautifulSoup

url = "http://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=9606&aa=1&style=N"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html)
text = soup.get_text()

with open('myfile.txt', 'w') as file:
    file.writelines(text)

with open('myfile.txt', 'r') as g:
    f = g.readlines()

tab = f[12:31]
table = [x.strip() for x in tab]
Every run of the code gets bogged down in writing the file and then reading it back.
You shouldn't need files. Filter for the pre tag instead, to target the table alone.
soup = BeautifulSoup(html, "html.parser")
text = soup.find('pre')
table = [x.strip() for x in text.get_text().splitlines()]
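Putting that together with the original fetch, a minimal end-to-end sketch (untested against the live page) that skips the temporary file entirely:

import urllib.request
from bs4 import BeautifulSoup

url = "http://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=9606&aa=1&style=N"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, "html.parser")

# the codon-usage table is plain text inside the page's <pre> element
pre_text = soup.find('pre').get_text()

# one entry per non-empty line of the table
table = [line.strip() for line in pre_text.splitlines() if line.strip()]
print(table)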
I am trying to save data that I have already scraped from the New York Times web page to a txt file.
import urllib.request
from bs4 import BeautifulSoup

# URL
html_page = 'https://www.nytimes.com/'
page = urllib.request.urlopen(html_page)
soup = BeautifulSoup(page, "html.parser")

title_box = soup.findAll("h2", class_="css-bzeb53 esl82me2")
print(title_box)

# Extract titles from list
titles = []
for occurence in title_box:
    titles.append(occurence.text.strip())
print(titles)
It works fine up to this point, but I can't manage to create the txt file and save the data to it.
# Save the Headlines
filename = '/home/stephan/Documents/NYHeads.txt'
with open(filename, 'w') as file_object:
    file_object.write(titles)
The problem is that what you write to a file has to be a string, and in your program titles is a list. You need to convert titles to a string. This should work:
filename = '/home/stephan/Documents/NYHeads.txt'
with open(filename, 'w') as file_object:
    file_object.write(str(titles))
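If you'd rather have one headline per line in the file instead of the printed list, joining with newlines works too (a small variation, not required by the question):

filename = '/home/stephan/Documents/NYHeads.txt'
with open(filename, 'w') as file_object:
    # one headline per line instead of the list's repr
    file_object.write('\n'.join(titles))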
I have a batch of URLs in a list, and I want to crawl some information from each of them:
import requests
from bs4 import BeautifulSoup as soup  # the soup(...) call below is the BeautifulSoup constructor

daa = ['https://old.reddit.com/r/Games/comments/a2p1ew/', 'https://old.reddit.com/r/Games/comments/9zzo0e/', 'https://old.reddit.com/r/Games/comments/a31a6q/']

for y in daa:
    uClient = requests.get(y, headers={'User-agent': 'your bot 0.1'})
    page_soup = soup(uClient.content, "html.parser")
    time = page_soup.findAll("p", {"class": "tagline"})[0].time.get('datetime').replace('-', '')
It works well and gets every time value I want. But I need to do it without the for loop, or rather: I need to open and write a file in the next step, and if I do that inside the same loop the output comes out wrong.
How do I get the time values without a for loop?
You could do as stated above and use open(file, 'a'). Or, what I like to do is append everything into a table and then write the whole thing out as a file.
import requests
import bs4
import pandas as pd

results = pd.DataFrame()
daa = ['https://old.reddit.com/r/Games/comments/a2p1ew/', 'https://old.reddit.com/r/Games/comments/9zzo0e/', 'https://old.reddit.com/r/Games/comments/a31a6q/']

for y in daa:
    uClient = requests.get(y, headers={'User-agent': 'your bot 0.1'})
    page_soup = bs4.BeautifulSoup(uClient.content, "html.parser")
    time = page_soup.findAll("p", {"class": "tagline"})[0].time.get('datetime').replace('-', '')
    temp_df = pd.DataFrame([[y, time]], columns=['url', 'time'])
    # DataFrame.append was removed in pandas 2.0; pd.concat does the same job here
    results = pd.concat([results, temp_df]).reset_index(drop=True)

results.to_csv('path/to_file.csv', index=False)
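Concatenating inside the loop works, but it copies the DataFrame on every pass. An alternative sketch, if the list of URLs grows, is to collect plain rows and build the DataFrame once at the end:

rows = []
for y in daa:
    uClient = requests.get(y, headers={'User-agent': 'your bot 0.1'})
    page_soup = bs4.BeautifulSoup(uClient.content, "html.parser")
    time = page_soup.findAll("p", {"class": "tagline"})[0].time.get('datetime').replace('-', '')
    rows.append({'url': y, 'time': time})

results = pd.DataFrame(rows, columns=['url', 'time'])
results.to_csv('path/to_file.csv', index=False)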
I have written a script to scrape data from a website. It has 2 columns, but I want to add another column to it (an abstract column). How can I do this inside the same loop? I need to get the 'abstract' data into the third column.
The code is below:
import requests
import csv
from bs4 import BeautifulSoup

file = "Details181.csv"
Headers = ["Category", "Vulnerabilities", "Abstract"]
url = "https:/en/weakness?po={}"

with open(file, 'w', newline='') as f:
    csvriter = csv.writer(f, delimiter=',', quotechar='"')
    csvriter.writerow(Headers)
    for page in range(1, 131):
        r = requests.get(url.format(page))
        soup = BeautifulSoup(r.text, 'lxml')
        for title in soup.select('div.title > h1'):
            csvriter.writerow([title.strip() for title in title.text.split(':')])
According to your description, I guess abstract, category, and vulnerability share a common parent div element.
I tried finding that common div and extracting the data from it in every loop, and that confirmed my guess. I also added a default value for vulnerability for the case where the title has no vulnerability content.
The following code runs successfully:
import requests
import csv
from bs4 import BeautifulSoup

file = "Details181.csv"
Headers = ["Category", "Vulnerabilities", "Abstract"]
url = "https://vulncat.fortify.com/en/weakness?po={}"

with open(file, 'w', newline='') as f:
    csvriter = csv.writer(f, delimiter=',', quotechar='"')
    csvriter.writerow(Headers)
    for page in range(1, 131):
        r = requests.get(url.format(page))
        soup = BeautifulSoup(r.text, 'lxml')
        # find the common parent div for each weakness
        all_father_info = soup.find_all("div", class_="detailcell weaknessCell panel")
        for father in all_father_info:
            # find the child h1, then extract category and vulnerability
            son_info_12 = father.find('h1').text.split(":")
            if len(son_info_12) == 2:
                category, vulnerability = son_info_12[0].strip(), son_info_12[1].strip()
            elif len(son_info_12) == 1:
                category = son_info_12[0].strip()
                vulnerability = ""
            else:
                category, vulnerability = "", ""
            # find the child div, then extract the abstract
            abstract = father.find("div", class_="t").text.strip()
            # write the row into the csv file
            csvriter.writerow([category, vulnerability, abstract])
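Not strictly necessary, but since this loops over 130 pages on the same host, reusing a single requests.Session keeps the connection alive and usually speeds the crawl up a bit (a small optional tweak, not part of the answer above):

import requests

# one Session reuses the underlying TCP connection across all the page requests
session = requests.Session()

def fetch(page_url):
    r = session.get(page_url)
    r.raise_for_status()  # fail fast on a bad page instead of parsing an error page
    return r.text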