BeautifulSoup Absolute URLs Print to CSV - python-3.x

I've been going through tons of threads on here to see if I can find a way to fix this code, but I can't quite seem to get it to work. I'm trying to scrape links from a site and then write them to a CSV. I found a way to get 95% of the way there, but I'm missing something for getting just the href. Here's the code:
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import csv

j = urllib.request.urlopen("http://cnn.com")
soup = BeautifulSoup(j, "lxml")
data = soup.find_all('a', href=True)

for url in soup.find_all('a', href=True):
    print(url.get('href'))

with open('marcel.csv', 'w', newline='') as csvfile:
    write = csv.writer(csvfile)
    write.writerows(data)

Here is probably what you want to do.
from bs4 import BeautifulSoup
import requests  # better than urllib for this
import csv

j = requests.get("http://cnn.com").content
soup = BeautifulSoup(j, "lxml")

data = []
for url in soup.find_all('a', href=True):
    print(url['href'])
    data.append(url['href'])
print(data)

with open("marcel.csv", 'w', newline='') as csvfile:
    write = csv.writer(csvfile)
    # wrap each href in its own one-element row; passing bare strings to
    # writerows would split each URL character by character
    write.writerows([u] for u in data)
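Many of the scraped hrefs will be relative (for example /politics), and the question's title asks for absolute URLs. Here is a short sketch (reusing the urllib.parse import from the question) that resolves them with urljoin before writing:

import csv
import urllib.parse

import requests
from bs4 import BeautifulSoup

base = "http://cnn.com"
soup = BeautifulSoup(requests.get(base).content, "lxml")

# urljoin leaves absolute hrefs untouched and resolves relative ones against base
links = [urllib.parse.urljoin(base, a['href'])
         for a in soup.find_all('a', href=True)]

with open("marcel.csv", 'w', newline='') as csvfile:
    csv.writer(csvfile).writerows([u] for u in links)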

I use openpyxl to do it. I think it is very easy, and it is part of one of my projects; you could try it:

from openpyxl import Workbook, load_workbook

def createExcel(self):
    # write-only mode; in newer openpyxl this flag is spelled write_only=True
    wb = Workbook(optimized_write=True)
    ws = wb.create_sheet(title='书籍列表')  # "book list"
    # header row: number, barcode, title, author, borrow date, return date, location
    row0 = ['编号', '条码号', '题名', '责任者', '借阅日期', '归还日期', '馆藏地']
    ws.append(row0)
    save_path = 'book_hist.xlsx'
    wb.save(save_path)

def saveToExcel(self, data_list):
    wb = load_workbook(filename='book_hist.xlsx')
    ws = wb.get_sheet_by_name('书籍列表')  # "book list" sheet
    for i in range(len(data_list)):
        ws.append(data_list[i])
    save_path = 'book_hist.xlsx'
    wb.save(save_path)
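These two methods come from a larger class in the project; a minimal usage sketch (the BookExporter wrapper and the sample row are hypothetical) could look like this:

class BookExporter:
    createExcel = createExcel  # reuse the functions above as methods
    saveToExcel = saveToExcel

exporter = BookExporter()
exporter.createExcel()  # creates book_hist.xlsx with the header row
exporter.saveToExcel([
    ['1', '9787111111111', 'Sample Title', 'Sample Author',
     '2021-01-01', '2021-02-01', 'Main Library'],
])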

Related

How to download image from URL using beautiful soup in high quality?

I am trying to download images with Beautiful Soup while importing a list of URLs from a .csv file. Right now I am getting results like this:
<img class="pick" src="backup/remote_2109image/008f3ef7-1da9-11ec-abad-88ae1db4aa6901.jpg" width="350height=616\"/>
In the code below, I grab every image from the page that has the class 'pick'. How will I download these into a folder?
import csv
import requests
import os
import urllib
from bs4 import BeautifulSoup as bs

with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        for image in tables:
            print(image)
You might try this:

import urllib.parse  # needed in addition to the imports above

with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        inParsed = urllib.parse.urlparse(imagesname)  # break down url
        rootUrl = f'{inParsed.scheme}://{inParsed.netloc}'  # to get root
        for image in tables:
            imageUrl = urllib.parse.urljoin(rootUrl, image.get('src'))  # add root to src
            saveImgAs = [u for u in imageUrl.split('/') if u][-1]  # get name from link
            with open(saveImgAs, "wb") as f:
                f.write(requests.get(imageUrl).content)  # download
            print(saveImgAs, image)

I'm not entirely sure how imageUrl should be formed, nor how consistent your image src values might be - if I had a few of your row values I could have run some tests first, but hopefully this works.
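For reference, this is how the urlparse/urljoin pair in that snippet combines a root with the src from the question (the page URL here is hypothetical):

from urllib.parse import urljoin, urlparse

page = "https://example.com/gallery/cats.html"  # stand-in for a URL from cat.csv
src = "backup/remote_2109image/008f3ef7-1da9-11ec-abad-88ae1db4aa6901.jpg"

parsed = urlparse(page)
root = f"{parsed.scheme}://{parsed.netloc}"  # "https://example.com"
print(urljoin(root, src))
# https://example.com/backup/remote_2109image/008f3ef7-1da9-11ec-abad-88ae1db4aa6901.jpg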
I made some changes to download the images from the URLs in the CSV file:

import csv
import requests
import os
import urllib.request  # urlretrieve lives in the request submodule
from bs4 import BeautifulSoup as bs

path = 'images'  # target folder; must already exist

with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        for image in tables:
            img_url = image.get('src').replace('\\', '/')
            real_url = "domain-name" + img_url  # replace with the site's root URL
            img_name = str(img_url.split('/')[-1])
            urllib.request.urlretrieve(real_url, os.path.join(path, img_name))
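Since "domain-name" is only a placeholder, a more robust variant (a sketch, reusing the urljoin idea from the earlier answer) resolves the src against the page URL itself and makes sure the folder exists; it drops into the same loop body:

import os
import urllib.parse
import urllib.request

os.makedirs(path, exist_ok=True)                      # create the target folder if needed
real_url = urllib.parse.urljoin(imagesname, img_url)  # resolve src against the page URL
urllib.request.urlretrieve(real_url, os.path.join(path, img_name))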

CSV writer writes set to a single row rather than multiple rows

I am working on a web scraper for class. I basically have to compile all of the http links from a website and write them to a CSV. They also need to be de-duplicated, which is why I'm using a set. I have all the parts complete except that when it writes to the CSV, the entire set of links writes to a single row rather than one link per row. Can someone review my code and tell me what I'm missing? I cannot find a solution anywhere.
My code is below:
from bs4 import BeautifulSoup
import requests
import csv
import urllib.parse

base_url = 'https://www.census.gov'
l = set()
r = requests.get("https://www.census.gov/programs-surveys/popest.html")
c = r.content
soup = BeautifulSoup(c, 'html.parser')

file = open('c996webscraper_writer.csv', 'w', newline="")

for link in soup.findAll('a'):
    output = link.get('href')
    abs_url = urllib.parse.urljoin(base_url, output)
    l.add(abs_url)

with file:
    write = csv.writer(file, delimiter=',', lineterminator='\r')
    write.writerow(['List of Links'])
    write.writerows([l])

file.close()
This is a printout of what's happening:
[CSV screenshot: the entire set of links appears in a single row]
The set was written as one row because writerows([l]) treats the whole collection as a single row; write each link as its own one-element row instead:

from bs4 import BeautifulSoup
import requests
import csv
import urllib.parse

base_url = 'https://www.census.gov'
l = list()
r = requests.get("https://www.census.gov/programs-surveys/popest.html")
c = r.content
soup = BeautifulSoup(c, 'html.parser')

file = open('c996webscraper_writer.csv', 'w', newline="")

for link in soup.findAll('a'):
    output = link.get('href')
    abs_url = urllib.parse.urljoin(base_url, output)
    l.append(abs_url)

with file:
    write = csv.writer(file)
    write.writerow(['List of Links'])
    for x in l:
        write.writerow([x])

file.close()
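Note that the answer switches the set to a list, which would let duplicates back in. A short sketch that keeps the question's de-duplicating set and still writes one link per row:

with open('c996webscraper_writer.csv', 'w', newline='') as f:
    write = csv.writer(f)
    write.writerow(['List of Links'])
    # sorted() only makes the output deterministic; the set already de-duplicates
    write.writerows([link] for link in sorted(l))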

Bs4 scrape a specific table

I hope you guys are always healthy.
I want to scrape a more specific table using BS4. This is my code:
from bs4 import BeautifulSoup
import requests

url = 'test.com'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')

for row in soup.select('tbody tr'):
    row_text = [x.text for x in row.find_all('td')]
    print(row_text)
How do I get results like this:
Number, Name, Address, Telp, Komoditi
1, "ABON JUARA" JUARA FOOD INDUSTRY, Jl. Jend Sudirman 339, Salatiga, Jawa Tengah, 0298-324060, Abon Sapi Dan Ayam
and save them to a CSV?
import requests
from bs4 import BeautifulSoup
import csv

def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    target = soup.select_one("table#newspaper-a").select("tr[valign=top]")
    with open("data.csv", 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["No", "Name", "Address", "Tel", "Komoditi"])
        for item in target:
            item = list(item.stripped_strings)
            item[3] = item[3][6:]  # drop the leading label (e.g. "Telp. ") from the phone field
            writer.writerow(item)

main("https://kemenperin.go.id/direktori-perusahaan?what=&prov=&hal=1")
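The fixed item[3] = item[3][6:] slice assumes every phone cell starts with a six-character label (presumably "Telp. "); a slightly defensive variant of the same loop strips the label only when it is actually present:

for item in target:
    cells = list(item.stripped_strings)
    # strip the label only if the phone field really starts with it
    if len(cells) > 3 and cells[3].startswith("Telp"):
        cells[3] = cells[3].split(".", 1)[-1].strip()
    writer.writerow(cells)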

Get the name of Instagram profile and the date of post with Python

I'm in the process of learning Python 3 and I'm trying to solve a simple task: getting the account name and the date of a post from an Instagram link.
import requests
from bs4 import BeautifulSoup

html = requests.get('https://www.instagram.com/p/BuPSnoTlvTR')
soup = BeautifulSoup(html.text, 'lxml')
item = soup.select_one("meta[property='og:description']")
name = item.find_previous_sibling().get("content").split("•")[0]
print(name)
This code sometimes works with links like https://www.instagram.com/kingtop, but I need it to also work with image posts like https://www.instagram.com/p/BuxB00KFI-x/. That's all I could come up with, but it isn't working, and I can't get the date either. Do you have any ideas? I appreciate any help.
I found a way to get the account name. Now I'm trying to find a way to get the upload date:
import requests
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
from requests.exceptions import HTTPError

start = time.time()

file = open('users.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()

for url in urls:
    url = url.strip('\n')
    try:
        req = requests.get(url)
        req.raise_for_status()
    except HTTPError:
        with open('output2.txt', 'a') as output:
            output.write('не найдена\n')  # "not found"
    except Exception:
        with open('output2.txt', 'a') as output:
            output.write('не найдены\n')  # "not found" (plural)
    else:
        soup = BeautifulSoup(req.text, "lxml")
        the_url = soup.select("[rel='canonical']")[0]['href']
        the_url2 = the_url.replace('https://www.instagram.com/', '')
        head, sep, tail = the_url2.partition('/')
        with open('output2.txt', 'a') as output:
            output.write(head + '\n')
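For the upload date, one avenue is the structured data embedded in the post page. This is a sketch only; Instagram's markup changes often, and it assumes the page still carries a JSON-LD block with an uploadDate field:

import json

import requests
from bs4 import BeautifulSoup

req = requests.get('https://www.instagram.com/p/BuPSnoTlvTR')
soup = BeautifulSoup(req.text, 'lxml')

# assumption: the page embeds a <script type="application/ld+json"> blob
ld = soup.find('script', type='application/ld+json')
if ld is not None:
    data = json.loads(ld.string)
    print(data.get('uploadDate'))  # ISO timestamp, if the field exists
else:
    print('no JSON-LD block found')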

How Can I Loop Through URLs and import TD elements from several Links

I am trying to import data from the following URLs and write each data set to a CSV file.
Here are a few sample URLs that I want to grab fundamental data from:
https://finviz.com/quote.ashx?t=sbuc
https://finviz.com/quote.ashx?t=msft
https://finviz.com/quote.ashx?t=aapl
How can I import the data from 'Index' to 'Change'?
I think the script should basically look like this:
import csv
import urllib.request
from bs4 import BeautifulSoup

soup = BeautifulSoup("html.parser")
url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']
for stocks in tckr:
    url_list = [url_base + tckr]

with open('C:/Users/Excel/Desktop/today.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    for url in url_list:
        try:
            fpage = urllib.request.urlopen(url)
            fsoup = BeautifulSoup(fpage, 'html.parser')
            # write header row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2-cp'})))
            # write body row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2'})))
        except urllib.error.HTTPError:
            print("{} - not found".format(url))
Except, when I run it, I get this error message: SyntaxError: EOL while scanning string literal
import csv
import requests
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError

url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']
url_list = [url_base + s for s in tckr]

with open('../Python/SOtest.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    for url in url_list:
        try:
            fpage = requests.get(url)
            fpage.raise_for_status()  # raise HTTPError on 4xx/5xx responses
            fsoup = BeautifulSoup(fpage.content, 'html.parser')
            # write header row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2-cp'})))
            # write body row
            writer.writerow(map(lambda e: e.text, fsoup.find_all('td', {'class': 'snapshot-td2'})))
        except HTTPError:
            print("{} - not found".format(url))
I use requests, so there is that difference, but it works, so you can pull code from there if need be.
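One caveat (an observation, not part of the original answer): finviz.com tends to reject requests' default User-Agent, so if you see 403 responses, sending a browser-like header may help. A minimal drop-in replacement for the requests.get line:

headers = {'User-Agent': 'Mozilla/5.0'}  # minimal browser-like UA
fpage = requests.get(url, headers=headers)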
