Scraping data and putting it in different columns using BeautifulSoup - python-3.x

I have written a script to scrape data from a website. It currently produces two columns, but I want to add a third column for the abstract. How can I do this inside the same loop? I need the 'abstract' data to appear in the third column.
The code is below:
import requests
import csv
from bs4 import BeautifulSoup

file = "Details181.csv"
Headers = ["Category", "Vulnerabilities", "Abstract"]
url = "https://vulncat.fortify.com/en/weakness?po={}"

with open(file, 'w', newline='') as f:
    csvriter = csv.writer(f, delimiter=',', quotechar='"')
    csvriter.writerow(Headers)
    for page in range(1, 131):
        r = requests.get(url.format(page))
        soup = BeautifulSoup(r.text, 'lxml')
        for title in soup.select('div.title > h1'):
            csvriter.writerow([part.strip() for part in title.text.split(':')])

Based on your description, I guessed that the abstract, category, and vulnerability share a common parent div element.
I then found that common div and extracted the data from it in every loop iteration, which confirmed the guess. I also added a default value for the vulnerability when the title has no vulnerability part.
The following code runs successfully:
import requests
import csv
from bs4 import BeautifulSoup

file = "Details181.csv"
Headers = ["Category", "Vulnerabilities", "Abstract"]
url = "https://vulncat.fortify.com/en/weakness?po={}"

with open(file, 'w', newline='') as f:
    csvriter = csv.writer(f, delimiter=',', quotechar='"')
    csvriter.writerow(Headers)
    for page in range(1, 131):
        r = requests.get(url.format(page))
        soup = BeautifulSoup(r.text, 'lxml')
        # find the common father div info
        all_father_info = soup.find_all("div", class_="detailcell weaknessCell panel")
        for father in all_father_info:
            # find the son div info, then extract data
            son_info_12 = father.find('h1').text.split(":")
            if len(son_info_12) == 2:
                category, vulnerability = son_info_12[0].strip(), son_info_12[1].strip()
            elif len(son_info_12) == 1:
                category = son_info_12[0].strip()
                vulnerability = ""
            else:
                category, vulnerability = "", ""
            # find the son div info, then extract abstract
            abstract = father.find("div", class_="t").text.strip()
            # write data into csv file
            csvriter.writerow([category, vulnerability, abstract])
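One caveat, not part of the original answer: if a weakness cell happens to lack the abstract div, father.find("div", class_="t") returns None and calling .text on it raises AttributeError. A minimal guard, assuming the same class names, would replace the abstract line inside the loop:

            # hedged guard: fall back to an empty string when the abstract div is missing
            abstract_div = father.find("div", class_="t")
            abstract = abstract_div.text.strip() if abstract_div else ""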

Related

How to download image from URL using beautiful soup in high quality?

I am trying to download images with Beautiful Soup while importing a list of URLs from a .csv file. Currently I am getting results like the one below:
<img class="pick" src="backup/remote_2109image/008f3ef7-1da9-11ec-abad-88ae1db4aa6901.jpg" width="350height=616\"/>
In the code below, I am grabbing every image from the URL that has the class 'pick'.
Now, how do I download these into a folder?
import csv
import requests
import os
import urllib
from bs4 import BeautifulSoup as bs

with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        for image in tables:
            print(image)
You might try this:

import urllib.parse  # needed for urlparse/urljoin below

with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        inParsed = urllib.parse.urlparse(imagesname)  # break down url
        rootUrl = f'{inParsed.scheme}://{inParsed.netloc}'  # to get root
        for image in tables:
            imageUrl = urllib.parse.urljoin(rootUrl, image.get('src'))  # add root to src
            saveImgAs = [u for u in imageUrl.split('/') if u][-1]  # get name from link
            with open(saveImgAs, "wb") as f:
                f.write(requests.get(imageUrl).content)  # download
            print(saveImgAs, image)
I'm not entirely sure about the formation of imageUrl, nor how consistent your image src values might be. If I had a few of your row values, I could have run a few tests first, but hopefully this works.
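For what it's worth, urllib.parse.urljoin copes with both relative and absolute src values, which is the main reason for building rootUrl above. A quick illustration (the URLs here are made up):

from urllib.parse import urljoin

root = 'https://example.com'
# relative src: joined onto the root
print(urljoin(root, 'backup/remote_2109image/pic.jpg'))
# -> https://example.com/backup/remote_2109image/pic.jpg
# absolute src: returned unchanged
print(urljoin(root, 'https://cdn.example.com/pic.jpg'))
# -> https://cdn.example.com/pic.jpg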
I made some changes to download the images from the URLs in the CSV file:
import csv
import requests
import os
import urllib.request
from bs4 import BeautifulSoup as bs

path = 'images'  # the original snippet left this undefined; point it at your target folder

with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        for image in tables:
            img_url = image.get('src').replace('\\', '/')
            real_url = "domain-name" + img_url
            img_name = str(img_url.split('/')[-1])
            urllib.request.urlretrieve(real_url, os.path.join(path, img_name))
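One assumption baked into the snippet above: os.path.join(path, img_name) needs path to point to an existing folder, since urlretrieve will not create directories. A minimal setup sketch (the folder name is illustrative):

import os

path = 'images'  # hypothetical folder name; match whatever 'path' is in the script above
os.makedirs(path, exist_ok=True)  # creates the folder if missing, no error if it exists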

How to scrape multiple pages with requests in python

I recently started getting into web scraping and have managed OK, but now I'm stuck and can't find the answer or figure it out.
Here is my code for scraping and exporting info from a single page:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime

page = requests.get("https://www.example.com/page.aspx?sign=1")
soup = BeautifulSoup(page.content, 'html.parser')

# finds the right heading to grab
box = soup.find('h1').text
heading = box.split()[0]

# finds the right paragraph to grab
reading = soup.find_all('p')[0].text
print(heading, reading)

# open a csv file with append, so old data will not be erased
with open('index.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([heading, reading, datetime.now()])
The problem occurs when I try to scrape multiple pages at the same time.
The pages are all the same; only the pagination changes, e.g.
https://www.example.com/page.aspx?sign=1
https://www.example.com/page.aspx?sign=2
https://www.example.com/page.aspx?sign=3
https://www.example.com/page.aspx?sign=4 etc
Instead of writing the same code 20 times, how do I stick all the data in a tuple or an array and export it to CSV?
Many thanks in advance.
Just try it with a loop, and keep going until no page is available (the request is not OK). Should be easy to get:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime

results = []
page_number = 1

while True:
    response = requests.get(f"https://www.example.com/page.aspx?sign={page_number}")
    if response.status_code != 200:
        break
    soup = BeautifulSoup(response.content, 'html.parser')
    # finds the right heading to grab
    box = soup.find('h1').text
    heading = box.split()[0]
    # finds the right paragraph to grab
    reading = soup.find_all('p')[0].text
    # append a list
    # results.append([heading, reading, datetime.now()])
    # or a tuple.. your call
    results.append((heading, reading, datetime.now()))
    page_number = page_number + 1

with open('index.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    for result in results:
        writer.writerow(result)

CSV writer writes set to a single row rather than multiple rows

I am working on a web scraper for class. I basically have to compile all of the HTTP links from a website and write them to a CSV. They also need to be de-duplicated, which is why I'm using a set. I have all the parts complete except that when it writes to the CSV, the entire set of links is written to a single row rather than one link per row. Can someone review my code and tell me what I'm missing? I cannot find a solution anywhere.
My code is below:
from bs4 import BeautifulSoup
import requests
import csv
import urllib.parse

base_url = 'https://www.census.gov'
l = set()

r = requests.get("https://www.census.gov/programs-surveys/popest.html")
c = r.content
soup = BeautifulSoup(c, 'html.parser')

file = open('c996webscraper_writer.csv', 'w', newline="")

for link in soup.findAll('a'):
    output = link.get('href')
    abs_url = urllib.parse.urljoin(base_url, output)
    l.add(abs_url)

with file:
    write = csv.writer(file, delimiter=',', lineterminator='\r')
    write.writerow(['List of Links'])
    write.writerows([l])
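The issue is that csv.writer.writerows expects an iterable of rows, so writerows([l]) passes a one-element list whose single element, the whole set, becomes one row. To get one link per row, write each link as its own one-element row: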
from bs4 import BeautifulSoup
import requests
import csv
import urllib.parse

base_url = 'https://www.census.gov'
l = set()  # a set keeps the links de-duplicated

r = requests.get("https://www.census.gov/programs-surveys/popest.html")
c = r.content
soup = BeautifulSoup(c, 'html.parser')

file = open('c996webscraper_writer.csv', 'w', newline="")

for link in soup.findAll('a'):
    output = link.get('href')
    abs_url = urllib.parse.urljoin(base_url, output)
    l.add(abs_url)

with file:
    write = csv.writer(file)
    write.writerow(['List of Links'])
    for x in l:
        write.writerow([x])  # one one-element row per link
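A side note, not part of the original fix: a set does not preserve the order in which links were found. If page order matters, dict.fromkeys gives an order-preserving de-duplication (Python 3.7+ dicts keep insertion order):

links_in_page_order = ['a', 'b', 'a', 'c']  # illustrative data
deduped = list(dict.fromkeys(links_in_page_order))
print(deduped)  # ['a', 'b', 'c']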

Python CSV exporting 4 identical rows

I have used one of the methods described in "Python write to CSV line by line" to attempt to write all the lines of my output to a .csv. I've managed to get it to the stage of outputting and generating the CSV, but instead of showing all the lines of my data, I am seeing one line repeated 4 times and nothing else.
Can anyone see what the issue is here?
from bs4 import BeautifulSoup
import requests
import csv

headers = {'User-Agent': 'Mozilla/5.0'}

for i in range(1, 300):
    url = "xxx?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', class_='product-block__info')
    for item in items:
        product = item.find('span', class_='short_desc').text
        stock = item.find('span', class_='count_product_stock hidden').text
        brand = item.find('h4', class_='brand').text
        price = item.find('span', class_='selling_price').text
        # create a list of all the fields
        sheets = [brand, product, stock, price]
        print(sheets)

with open('csvfile.csv', 'wt') as file:
    for l in sheets:
        file.writelines(sheets)
        file.write('\n')
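To see why one line repeats 4 times: the with open block only runs after both loops have finished, so sheets holds just the last product's four fields, and for l in sheets then iterates four times, writing the whole sheets list on each pass.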
You probably want something more like the following untested code. The example provided can't be run as is:
from bs4 import BeautifulSoup
import requests
import csv

headers = {'User-Agent': 'Mozilla/5.0'}

# Open the file once. See the csv documentation for the correct way to open
# a file for use with csv.writer. If you plan to open the .csv with
# Excel, the utf-8-sig encoding will allow non-ASCII to work correctly.
with open('csvfile.csv', 'w', encoding='utf-8-sig', newline='') as f:
    file = csv.writer(f)  # actually use the CSV module.
    for i in range(1, 300):
        url = "xxx?page=%s" % i
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        items = soup.find_all('div', class_='product-block__info')
        for item in items:
            product = item.find('span', class_='short_desc').text
            stock = item.find('span', class_='count_product_stock hidden').text
            brand = item.find('h4', class_='brand').text
            price = item.find('span', class_='selling_price').text
            # create a list of all the fields
            sheets = [brand, product, stock, price]
            # write a single line.
            file.writerow(sheets)
Here's a tested example that will open in Excel. I threw in a non-ASCII character and a comma in the data to demonstrate the csv module's ability to handle it:
# coding: utf8
import csv

with open('csvfile.csv', 'w', encoding='utf-8-sig', newline='') as f:
    file = csv.writer(f)
    file.writerow('BRAND PRODUCT STOCK PRICE'.split())
    for i in range(1, 11):
        sheets = ['brand{}'.format(i), 'pröduct{}'.format(i), 'st,ock{}'.format(i), 'price{}'.format(i)]
        file.writerow(sheets)
Output:
BRAND,PRODUCT,STOCK,PRICE
brand1,pröduct1,"st,ock1",price1
brand2,pröduct2,"st,ock2",price2
brand3,pröduct3,"st,ock3",price3
brand4,pröduct4,"st,ock4",price4
brand5,pröduct5,"st,ock5",price5
brand6,pröduct6,"st,ock6",price6
brand7,pröduct7,"st,ock7",price7
brand8,pröduct8,"st,ock8",price8
brand9,pröduct9,"st,ock9",price9
brand10,pröduct10,"st,ock10",price10

Accessing commented HTML lines with BeautifulSoup

I am attempting to webscrape stats from this specific webpage: https://www.sports-reference.com/cfb/schools/louisville/2016/gamelog/
However, the table for the 'Defensive Game Log' appears to be commented out when I look at the HTML source (it starts with <!-- and ends with -->).
Because of this, the following BeautifulSoup4 code only grabs the offensive data that is not commented out and misses the defensive data.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

accessurl = 'https://www.sports-reference.com/cfb/schools/oklahoma-state/2016/gamelog/'
req = Request(accessurl)
link = urlopen(req)
soup = BeautifulSoup(link.read(), "lxml")

tables = soup.find_all(['th', 'tr'])
my_table = tables[0]
rows = my_table.findChildren(['tr'])
for row in rows:
    cells = row.findChildren('td')
    for cell in cells:
        value = cell.string
        print(value)
Is there any way to collect all of the defensive values into a list, the same way the offensive data is stored, whether inside or outside of BeautifulSoup4? Thanks!
Note: building on the accepted answer below, I extract the rows like this (defensive_log comes from that answer):
data = []
table = defensive_log
table_body = table.find('tbody')

rows = table_body.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele])  # get rid of empty values
The Comment object will give you what you want:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup, Comment

accessurl = 'https://www.sports-reference.com/cfb/schools/oklahoma-state/2016/gamelog/'
req = Request(accessurl)
link = urlopen(req)
soup = BeautifulSoup(link, "lxml")

# comment nodes are string nodes, so filter on the node type
comments = soup.find_all(string=lambda text: isinstance(text, Comment))
for comment in comments:
    comment = BeautifulSoup(str(comment), 'lxml')
    defensive_log = comment.find('table')  # search as an ordinary tag
    if defensive_log:
        break
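As a minimal, self-contained illustration of the mechanism (the HTML here is made up, not from sports-reference): Comment is a NavigableString subclass, so comment nodes can be filtered by type and their text re-parsed as ordinary HTML.

from bs4 import BeautifulSoup, Comment

html = '<div><!-- <table><tr><td>hidden</td></tr></table> --></div>'  # illustrative markup
soup = BeautifulSoup(html, 'html.parser')

# grab the comment node, then parse its contents as a normal document
comment = soup.find(string=lambda text: isinstance(text, Comment))
inner = BeautifulSoup(str(comment), 'html.parser')
print(inner.find('td').text)  # -> hidden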
