Python - Problem with saving as '.csv' extension - python-3.x

Functioning code:
import requests
from bs4 import BeautifulSoup
import pyautogui
import csv
url = 'https://url~~'
res = requests.get(url)
html = res.text
soup = BeautifulSoup(html, 'html.parser')
total = soup.select('.gall_title')
searchList = []
contentList = []
for i in total:
    searchList.append("https://url~~" + i.attrs['href'])
for i in searchList:
    res2 = requests.get(i)
    html2 = res2.text
    soup2 = BeautifulSoup(html2, 'html.parser')
    content_h = soup2.select('h3 > span.title_subject')
    contentList.append(content_h)
print(contentList)
# save csv
f = open('1.csv', 'w', encoding='utf-8', newline='')
csvWriter = csv.writer(f)
for i in contentList:
    csvWriter.writerow(i)
f.close()
★Result★
print(contentList):
# [[<span class="title_subject">tomato</span>], [<span class="title_subject">apple</span>]]
Non-functioning code:
import requests
from bs4 import BeautifulSoup
import pyautogui
import csv
url = 'https://url~~'
res = requests.get(url)
html = res.text
soup = BeautifulSoup(html, 'html.parser')
total = soup.select('.gall_title')
searchList = []
contentList = []
for i in total:
    searchList.append("https://url~~" + i.attrs['href'])
for i in searchList:
    res2 = requests.get(i)
    html2 = res2.text
    soup2 = BeautifulSoup(html2, 'html.parser')
    content_h = str(soup2.select('h3 > span.title_subject'))  # only this line changed
    contentList.append(content_h)
print(contentList)
# save csv
f = open('1.csv', 'w', encoding='utf-8', newline='')
csvWriter = csv.writer(f)
for i in contentList:
    csvWriter.writerow(i)
f.close()
★Result★
print(contentList):
['[tomato]', '[apple]']
How can I fix the issue where the strings are saved one character at a time in the '.csv' file?

soup2.select('h3 > span.title_subject') gives you the whole HTML tags. The str(...) version breaks because csvWriter.writerow expects a sequence of fields; when you hand it a plain string, it iterates over the string and writes one character per cell. Extract each tag's string value with .string instead:
strings = [x.string for x in soup2.select('h3 > span.title_subject')]
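A minimal sketch of the fixed saving step, assuming contentList still holds the tag lists from the working version:

import csv

# contentList holds lists of <span class="title_subject"> tags, one list per page
with open('1.csv', 'w', encoding='utf-8', newline='') as f:
    csvWriter = csv.writer(f)
    for tags in contentList:
        strings = [x.string for x in tags]  # e.g. ['tomato']
        csvWriter.writerow(strings)  # a list of fields -> one row, not one character per cell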

Related

How to download images from URLs using Beautiful Soup in high quality?

I am trying to download images using Beautiful Soup while importing a list of URLs from a .csv file. Currently I am getting results like this:
<img class="pick" src="backup/remote_2109image/008f3ef7-1da9-11ec-abad-88ae1db4aa6901.jpg" width="350height=616\"/>
In the code below, I am finding every image tag with the class 'pick'.
Now, how will I download these into a folder?
import csv
import requests
import os
import urllib
from bs4 import BeautifulSoup as bs
with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        for image in tables:
            print(image)
You might try this:
import urllib.parse  # needed for urlparse/urljoin below

with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        inParsed = urllib.parse.urlparse(imagesname)  # break down url
        rootUrl = f'{inParsed.scheme}://{inParsed.netloc}'  # to get root
        for image in tables:
            imageUrl = urllib.parse.urljoin(rootUrl, image.get('src'))  # add root to src
            saveImgAs = [u for u in imageUrl.split('/') if u][-1]  # get name from link
            with open(saveImgAs, "wb") as f:
                f.write(requests.get(imageUrl).content)  # download
            print(saveImgAs, image)
I'm not entirely sure about the formation of imageUrl, nor about how consistent your image src values might be - if I had a few of your row values, I could have run a few tests first, but hopefully this works.
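For reference, a quick sketch of how urljoin resolves a relative src against the site root (the URLs here are made up for illustration):

from urllib.parse import urljoin, urlparse

page = 'https://example.com/cats/page1.html'   # hypothetical page URL from cat.csv
src = 'backup/remote_2109image/008f3ef7.jpg'   # hypothetical relative src

parsed = urlparse(page)
root = f'{parsed.scheme}://{parsed.netloc}'    # 'https://example.com'
print(urljoin(root, src))                      # 'https://example.com/backup/remote_2109image/008f3ef7.jpg'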
I made some changes to download the images from the URLs in the CSV file:
import csv
import requests
import os
import urllib.request  # urlretrieve lives in urllib.request
from bs4 import BeautifulSoup as bs

path = 'images'  # destination folder (assumed - use whatever folder you want)
os.makedirs(path, exist_ok=True)

with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        for image in tables:
            img_url = image.get('src').replace('\\', '/')
            real_url = "domain-name" + img_url
            img_name = str(img_url.split('/')[-1])
            urllib.request.urlretrieve(real_url, os.path.join(path, img_name))

How can I get BeautifulSoup info in the same row?

I am currently web scraping and would like to get the specifications on the same row. When I print it now, column 2 looks like this:
text
text
text
text
text
I would like to get it all on the same row like this
text text text text text
so I can later chop it up into different columns in Excel.
Is there maybe a transposing command I could use, or something else?
Code:
import requests
from bs4 import BeautifulSoup
import csv
with open('Oslo.csv', 'w', newline='') as f:
    fieldnames = ['column1', 'column2']
    skriver = csv.DictWriter(f, fieldnames=fieldnames)
    skriver.writeheader()

    def data(page_number):
        URL = 'https://www.url.com/' + str(page_number) + '&sort=PUBLISHED_DESC'
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
        for data in ads:
            id = data.find('a')
            link = (id['id'])
            url = 'https://www.url.com/' + str(link)
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            ads = soup.findAll('div', class_="u-word-break")
            for stats in ads:
                address = stats.find('p', class_="u-caption")
                specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
                skriver.writerow({'column1': address.text.strip(), 'column2': specs.text})

    for x in range(1, 2):
        data(x)
    print('Ferdig, du kan åpne oslo.csv')
EDIT: Scraping from the website is illegal, so I removed the URL.
Your specs.text is a string that contains \n newlines. You can split it, then join it back together with a single space, i.e. ' '.join(specs.text.split()).
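For example, with a toy value standing in for specs.text:

specs_text = 'text\ntext\ntext\ntext\ntext'
print(' '.join(specs_text.split()))  # prints: text text text text text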
import requests
from bs4 import BeautifulSoup
import csv
with open('Oslo.csv', 'w', newline='') as f:
    fieldnames = ['column1', 'column2']
    skriver = csv.DictWriter(f, fieldnames=fieldnames)
    skriver.writeheader()

    def data(page_number):
        URL = 'https://www.url.com/' + str(page_number) + '&sort=PUBLISHED_DESC'
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
        for data in ads:
            id = data.find('a')
            link = (id['id'])
            url = 'https://www.url.com/' + str(link)
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            ads = soup.findAll('div', class_="u-word-break")
            for stats in ads:
                address = stats.find('p', class_="u-caption")
                specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
                address = ' '.join(address.text.split())
                specs = ' '.join(specs.text.split())  # <-- changed here
                skriver.writerow({'column1': address, 'column2': specs})

    for x in range(1, 2):
        data(x)
    print('Ferdig, du kan åpne oslo.csv')

How to write to a csv from Python

I am web scraping pacsun.com with Python and trying to put the results into a csv file, but when I open the file only the headers appear, not the product_name, price, or new_arrival values.
So my question is: how do I get these values to appear under the headers in the csv file?
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq  # needed for uReq below
import csv

my_url = ('https://www.pacsun.com/mens/')
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')
product_data = page_soup.findAll('div',{'class':'product-data'})
#print(len(product_data))
#print(product_data[0])
product = product_data[0]
filename = 'pacsun.csv'
f = open(filename,'w')
headers = 'product_name, price, new_arrival\n'
f.write(headers)
for product in product_data:
    #name = product.div.a["title"]
    product_name = print('product: ' + product.div.a["title"])
    #the code above gets the title of the product
    price = product.findAll('div',{'class':'product-price group'})
    #the code above gets the price of the product
    new_arrival = product.findAll('div',{'class':'new'})
    #the code above gets the new-arrival flag
    print(price[0].text)
    print(new_arrival[0].text)
    thewriter = csv.DictWriter(filename, headers)
    thewriter.writerow({'product_name':product_name, 'price':price, 'new_arrival':new_arrival})
    #f.write(product_name.replace(",", "|") + "," + price + ","+ new_arrival + "\n")
f.close()
You have a problem with the data handling, so I fixed it and it works fine. You only need to change 'w' to 'a', i.e. f = open(filename,'a'), and to put the f.write call inside the loop:
from bs4 import BeautifulSoup as soup
import csv
from urllib.request import urlopen as uReq

my_url = ('https://www.pacsun.com/mens/')
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')
product_data = page_soup.findAll('div',{'class':'product-data'})
#print(len(product_data))
#print(product_data[0])
product = product_data[0]
filename = 'pacsun.csv'
f = open(filename,"a")
headers = 'product_name, price, new_arrival\n'
f.write(headers)
for product in product_data:
    #name = product.div.a["title"]
    product_name = print('product: ' + product.div.a["title"])
    #the code above gets the title of the product
    price = product.findAll('div',{'class':'product-price group'})
    #the code above gets the price of the product
    new_arrival = product.findAll('div',{'class':'new'})
    price_ = ''
    new_arrival_ = ''
    product_name_ = ''
    # product_name_ = ' '.join([str(elem) for elem in product.div.a["title"]])
    for price_text in price:
        price_ = price_text.text
    for new_arrival_text in new_arrival:
        new_arrival_ = new_arrival_text.text
    f.write(product.div.a["title"] + "," + price_ + "," + new_arrival_ + "\n")
f.close()
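As a side note, a sketch of a tidier variant (my suggestion, not part of the answer above) that keeps the csv module in play, so commas inside titles get quoted automatically:

import csv

# assumes product_data has already been scraped as above
with open('pacsun.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['product_name', 'price', 'new_arrival'])
    for product in product_data:
        price = product.findAll('div', {'class': 'product-price group'})
        new_arrival = product.findAll('div', {'class': 'new'})
        writer.writerow([
            product.div.a["title"],
            price[0].text.strip() if price else '',
            new_arrival[0].text.strip() if new_arrival else '',
        ])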

'list' object has no attribute 'timeout' and only prints first item in the table

I am trying to pull a table from a list of URLs. When I input only one URL, it only prints out the first items in the table, and when I add more URLs to the list I get the error message 'list' object has no attribute 'timeout'. What is the best way to get the rest of the items while adding more URLs?
Below is the code I am running.
import time, random, csv, bs4, requests, io
import pandas as pd
timeDelay = random.randrange(5, 20)
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_urls = [
    "https://www.lonza.com/products-services/bio-research/electrophoresis-of-nucleic-acids-and-proteins/nucleic-acid-electrophoresis/precast-gels-for-dna-and-rna-analysis/truband-gel-anchors.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-epithelial-cells/nucleofector-kits-for-human-mammary-epithelial-cells-hmec.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-neural-cells/nucleofector-kits-for-mammalian-glial-cells.aspx",
]
uClient = uReq(my_urls)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll('tbody')
product_name_list = []
cat_no_list = []
size_list = []
price_list = []
for container in containers:
    if (len(container) > 0):
        #try:
        title_container = container.findAll('td')
        Product_name = title_container[0].text.strip()
        product_name_list.append(Product_name)
        CatNo_container = container.findAll('td')
        CatNo = CatNo_container[1].text.strip()
        cat_no_list.append(CatNo)
        #Size_container = container.findAll('div',{'class':'col-xs-2 noPadding'})
        #Size = Size_container[0].text.strip()
        #size_list.append(Size)
        Price_container = container.findAll('td')
        Price = Price_container[4].text.strip()
        price_list.append(Price)
        print('Product_name: ' + Product_name)
        print('CatNo: ' + CatNo)
        print('Size: ' + 'N/A')
        print('Price: ' + Price)
        print(" ")
time.sleep(timeDelay)
You are passing a list here: uClient = uReq(my_urls), where a string is required.
You need to pass the individual elements of the list, i.e. the strings.
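A minimal repro of that error: urlopen tries to set a timeout attribute on whatever it is given, and a list does not allow attribute assignment.

from urllib.request import urlopen

urlopen(["https://www.lonza.com"])
# AttributeError: 'list' object has no attribute 'timeout'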
Here is the edited code that works for multiple URLs.
UPDATED CODE (to get all items):
import time, random, csv, bs4, requests, io
import pandas as pd
timeDelay = random.randrange(5, 20)
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_urls = [
    "https://www.lonza.com/products-services/bio-research/electrophoresis-of-nucleic-acids-and-proteins/nucleic-acid-electrophoresis/precast-gels-for-dna-and-rna-analysis/truband-gel-anchors.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-epithelial-cells/nucleofector-kits-for-human-mammary-epithelial-cells-hmec.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-neural-cells/nucleofector-kits-for-mammalian-glial-cells.aspx",
]
for url in my_urls:
    print("URL using: ", url)
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll('tbody')
    product_name_list = []
    cat_no_list = []
    size_list = []
    price_list = []
    for container in containers:
        if (len(container) > 0):
            #try:
            items = container.findAll('tr')
            for item in items:
                item = item.text.split('\n')
                Product_name = item[1]
                product_name_list.append(Product_name)
                CatNo = item[2]
                cat_no_list.append(CatNo)
                #Size_container = container.findAll('div',{'class':'col-xs-2 noPadding'})
                #Size = Size_container[0].text.strip()
                #size_list.append(Size)
                Price = item[6]
                price_list.append(Price)
                print('Product_name: ' + Product_name)
                print('CatNo: ' + CatNo)
                print('Size: ' + 'N/A')
                print('Price: ' + Price)
                print(" ")
    time.sleep(timeDelay)
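One caveat with the updated code: the four accumulator lists are re-created on every pass through my_urls, so after the loop they only hold the last URL's rows. If you want totals across all URLs, initialize the lists once, before the loop. A toy illustration of the difference:

rows = []
for url in ["a", "b"]:
    rows = []           # re-created each pass: earlier results are thrown away
    rows.append(url)
print(rows)             # ['b'] - only the last URL's data survives

rows = []               # initialized once, before the loop
for url in ["a", "b"]:
    rows.append(url)
print(rows)             # ['a', 'b'] - data from every URL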

How to keep only the href links that start with "/listings" using Python

I am trying to scrape the links in https://www.panpages.my/search_results?q=
I have written a Python script to get all the links on each page.
I want to keep only the links that start with "/listings".
Please find my script below and help me:
import requests
from bs4 import BeautifulSoup
import re
from io import StringIO
import csv
data = open("D:/Mine/Python/Projects/Freelancer/seekProgramming/rootpages.csv").read()
dataFile = StringIO(data)
csvReader = csv.reader(dataFile)

f = open('paylinks.csv', 'w', newline='')
writer = csv.writer(f)

def simple_web_scrapper(url):
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for root in soup.findAll('div', {"class": "mid_section col-xs-10 col-sm-7 tmargin xs-nomargin"}):
        for link in root.findAll('a'):
            href = link.get('href')
            print(href)

for row in csvReader:
    myurl = row[0]
    simple_web_scrapper(myurl)
Add a startswith check inside the inner loop:

def simple_web_scrapper(url):
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for root in soup.findAll('div', {"class": "mid_section col-xs-10 col-sm-7 tmargin xs-nomargin"}):
        for link in root.findAll('a'):
            href = link.get('href')
            if href.startswith('/listings'):  # that's the row you need
                print(href)

for row in csvReader:
    myurl = row[0]
    simple_web_scrapper(myurl)
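As a side note, the script opens paylinks.csv and builds a csv.writer, but never writes a row to it. A sketch of the missing step, reusing the writer and f objects defined above:

def simple_web_scrapper(url):
    source_code = requests.get(url)
    soup = BeautifulSoup(source_code.text, 'html.parser')
    for root in soup.findAll('div', {"class": "mid_section col-xs-10 col-sm-7 tmargin xs-nomargin"}):
        for link in root.findAll('a'):
            href = link.get('href')
            if href and href.startswith('/listings'):
                writer.writerow([href])  # one matching link per row

for row in csvReader:
    simple_web_scrapper(row[0])

f.close()  # flush paylinks.csv to disk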
