CSV writer writes set to a single row rather than multiple rows - python-3.x

I am working on a web scraper for class. I basically have to compile all of the http links from a website and write them to a csv. They also need to be de-duplicated which is why I'm using a set. I have all the parts complete expect when it writes to the csv, the entire set of links writes to a single row rather than one link per row. Can someone review my code and tell me what i'm missing? I cannot find a solution anywhere.
My code is below:
from bs4 import BeautifulSoup
import requests
import csv
import urllib.parse
base_url = 'https://www.census.gov'
l = set()
r = requests.get("https://www.census.gov/programs-surveys/popest.html")
c = r.content
soup = BeautifulSoup(c, 'html.parser')
file = open('c996webscraper_writer.csv', 'w', newline="")
for link in soup.findAll('a'):
output = link.get('href')
abs_url = urllib.parse.urljoin(base_url, output)
l.add(abs_url)
with file:
write = csv.writer(file, delimiter = ',', lineterminator = '\r')
write.writerow(['List of Links'])
write.writerows([l])
file.close()
This is a printout of what's happening:
CSV Image

from bs4 import BeautifulSoup
import requests
import csv
import urllib.parse
base_url = 'https://www.census.gov'
l = list()
r = requests.get("https://www.census.gov/programs-surveys/popest.html")
c = r.content
soup = BeautifulSoup(c, 'html.parser')
file = open('c996webscraper_writer.csv', 'w', newline="")
for link in soup.findAll('a'):
output = link.get('href')
abs_url = urllib.parse.urljoin(base_url, output)
l.append(abs_url)
with file:
write = csv.writer(file)
write.writerow(['List of Links'])
for x in l:
write.writerow([x])
file.close()

Related

How to download image from URL using beautiful soup in high quality?

I am trying to download images using beautiful soup While Importing a list of URLs from .CSV file. Now I am getting results like below,
<img class="pick" src="backup/remote_2109image/008f3ef7-1da9-11ec-abad-88ae1db4aa6901.jpg" width="350height=616\"/>
In the below code, I am trying to get an image from URL that has the class 'pick'
Now, How Will I download this in a folder?
import csv
import requests
import os
import urllib
from bs4 import BeautifulSoup as bs
with open('cat.csv', 'r') as file:
reader = csv.reader(file)
for row in reader:
imagesname = ' '.join(row)
r = requests.get(imagesname)
soup = bs(r.content, 'html.parser')
tables = soup.find_all('img', class_='pick')
for image in tables:
print(image)
You might try this:
with open('cat.csv', 'r') as file:
reader = csv.reader(file)
for row in reader:
imagesname = ' '.join(row)
r = requests.get(imagesname)
soup = bs(r.content, 'html.parser')
tables = soup.find_all('img', class_='pick')
inParsed = urllib.parse.urlparse(imagesname) # break down url
rootUrl = f'{inParsed.scheme}://{inParsed.netloc}' # to get root
for image in tables:
imageUrl = urllib.parse.urljoin(rootUrl, imageUrl.get('src')) # add root to src
saveImgAs = [u for u in imageUrl.split('/') if u][-1] # get name from link
with open(saveImgAs, "wb") as f:
f.write(requests.get(imageUrl).content) # download
f.close()
print(saveImgAs, image)
I'm not entirely sure about the formation of imageUrl nor of how consistent your image src values might be - if I had a few of your row values, I would have been able to run a few tests first, but hopefully this works
I made some changes to download image from URL which is in CSV file
import csv
import requests
import os
import urllib
from bs4 import BeautifulSoup as bs
with open('cat.csv', 'r') as file:
reader = csv.reader(file)
for row in reader:
imagesname = ' '.join(row)
r = requests.get(imagesname)
soup = bs(r.content, 'html.parser')
tables = soup.find_all('img', class_='pick')
for image in tables:
img_url = image.get('src').replace('\\', '/')
real_url = "domain-name" + img_url
img_name = str(img_url.split('/')[-1])
urllib.request.urlretrieve(real_url, os.path.join(
path, img_name))

Scraping data and putting it in different columns using BeautifulSoup

I have written a script to scrape data from a website. It has 2 columns. But I want to add another column to it (abstract column). How can I do this inside the same loop? I need to get the 'abstract' data in the third column. Image attached below.
The code is below:
import requests
import csv
from bs4 import BeautifulSoup
file = "Details181.csv"
Headers = ["Category", "Vulnerabilities", "Abstract"]
url = "https:/en/weakness?po={}"
with open(file, 'w', newline='') as f:
csvriter = csv.writer(f, delimiter=',', quotechar='"')
csvriter.writerow(Headers)
for page in range(1, 131):
r = requests.get(url.format(page))
soup = BeautifulSoup(r.text, 'lxml')
for title in soup.select('div.title > h1'):
csvriter.writerow([title.strip() for title in
title.text.split(':')]);
According to your description, I guess abstract and category, vulnerability maybe have the common father div element.
Then I try to find the common div and extract data in every loop, finally, I verified my guess, I also add default value for vulnerability when title has no vulnerability content
The following code run successfully
import requests
import csv
from bs4 import BeautifulSoup
file = "Details181.csv"
Headers = ["Category", "Vulnerabilities", "Abstract"]
url = "https://vulncat.fortify.com/en/weakness?po={}"
with open(file, 'w', newline='') as f:
csvriter = csv.writer(f, delimiter=',', quotechar='"')
csvriter.writerow(Headers)
for page in range(1, 131):
r = requests.get(url.format(page))
soup = BeautifulSoup(r.text, 'lxml')
# find the common father div info
all_father_info = soup.find_all("div", class_="detailcell weaknessCell panel")
for father in all_father_info:
# find the son div info, then extract data
son_info_12 = father.find('h1').text.split(":")
if len(son_info_12) == 2:
category, vulnerability = son_info_12[0].strip(), son_info_12[1].strip()
elif len(son_info_12) == 1:
category = son_info_12[0].strip()
vulnerability = ""
else:
category, vulnerability = "", ""
# find the son div info, then extract abstract
abstract = father.find("div", class_="t").text.strip()
# write data into csv file
csvriter.writerow([category, vulnerability, abstract])

How Can I Loop Through URLs and import TD elements from several Links

I am trying to import data from the following URLs, and write each data set to a CSV file.
Here are a few sample URls that I want to grab fundamental data from:
https://finviz.com/quote.ashx?t=sbuc
https://finviz.com/quote.ashx?t=msft
https://finviz.com/quote.ashx?t=aapl
How can I import the data from 'Index' to 'Change'?
I think the script should, basically, look like this.
import csv
import urllib.request
from bs4 import BeautifulSoup
soup = BeautifulSoup("html.parser")
url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']
for stocks in tckr:
url_list = [url_base + tckr]
with open('C:/Users/Excel/Desktop/today.csv', 'a', newline='') as file:
writer = csv.writer(file)
for url in url_list:
try:
fpage = urllib.request.urlopen(url)
fsoup = BeautifulSoup(fpage, 'html.parser')
# write header row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2-cp'})))
# write body row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2'})))
except urllib.error.HTTPError:
print("{} - not found".format(url))
Except, when I run it, I get this error message: SyntaxError: EOL while scanning string literal
import csv
import requests
from bs4 import BeautifulSoup
url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']
url_list = [url_base + s for s in tckr]
with open('../Python/SOtest.csv', 'a', newline='') as f:
writer = csv.writer(f)
for url in url_list:
try:
fpage = requests.get(url)
fsoup = BeautifulSoup(fpage.content, 'html.parser')
# write header row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2-cp'})))
# write body row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2'})))
except HTTPError:
print("{} - not found".format(url))
I use requests so there is that difference. But it works so you can pull code from there if need be.

Can't append Base URL to create absolute links with Beatifulsoup Python 3

I get a list of links in the output file but need all of the links to show as absolute links. Some are absolute and others are relative. How do I append the base url to the relatives to ensure that I get only absolute links in the csv output?
I get back all the links but not all are absolute links e.g /subpage instead of http://page.com/subpage
from bs4 import BeautifulSoup
import requests
import csv
j = requests.get("http://cnn.com").content
soup = BeautifulSoup(j, "lxml")
#only return links to subpages e.g. a tag that contains href
data = []
for url in soup.find_all('a', href=True):
print(url['href'])
data.append(url['href'])
print(data)
with open("file.csv",'w') as csvfile:
write = csv.writer(csvfile, delimiter = ' ')
write.writerows(data)
content = open('file.csv', 'r').readlines()
content_set = set(content)
cleandata = open('file.csv', 'w')
for line in content_set:
cleandata.write(line)
with urljoin:
from urlparse import urljoin
...
base_url = "http://cnn.com"
absolute_url = urljoin(base_url, relative_url)

BeautifulSoup Absoute URLs Print to CSV

I've been going through tons of threads on here to see if I can find a way to fix this code but cant quite seem to get this to work. I'm trying to scrape links from a site then write to csv. Here's the code:
I found a way to get 95% of the way there but am missing something for getting just the href:
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import csv
j = urllib.request.urlopen("http://cnn.com")
soup = BeautifulSoup(j, "lxml")
data = soup.find_all('a', href=True)
for url in soup.find_all('a', href=True):
#print(url.get('href'))
with open('marcel.csv', 'w', newline='') as csvfile:
write = csv.writer(csvfile)
write.writerows(data)
Here is probably what you want to do.
from bs4 import BeautifulSoup
import requests #better than urllib
import csv
j = requests.get("http://cnn.com").content
soup = BeautifulSoup(j, "lxml")
data = []
for url in soup.find_all('a', href=True):
print(url['href'])
data.append(url['href'])
print(data)
with open("marcel.csv",'w') as csvfile:
write = csv.writer(csvfile, delimiter = ' ')
write.writerows(data)
I use openpyxl to get it
from openpyxl import Workbook,load_workbook
I think it is very easy.
it is a part of my project,you could try it
def createExcel(self):
wb = Workbook(optimized_write=True)
ws = wb.create_sheet(title='书籍列表')
row0 = ['编号','条码号','题名','责任者','借阅日期','归还日期','馆藏地']
ws.append(row0)
save_path = 'book_hist.xlsx'
wb.save(save_path)
def saveToExcel(self,data_list):
wb = load_workbook(filename='book_hist.xlsx')
ws = wb.get_sheet_by_name('书籍列表')
for i in range(len(data_list)):
ws.append(data_list[i])
save_path = 'book_hist.xlsx'
wb.save(save_path)

Resources