How to make these parsed elements not repeat? BeautifulSoup - python-3.x

Which function (or other approach) is best so that these nicknames do not repeat in my parser? I don't know how to do that. I'll be very grateful if you help me.
Source:
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup

# save all the nicknames to 'CSV' file format
filename = "BattlePassNicknames.csv"
f = open(filename, "a", encoding="utf-8")
headers1 = "Member of JAZE Battle Pass 2019\n"
b = 1
if b < 2:
    f.write(headers1)
    b += 1

# start page
i = 1
while True:
    # disable jaze guard. turn off html 'mod_security'
    link = 'https://jaze.ru/forum/topic?id=50&page=' + str(i)
    my_url = Request(
        link,
        headers={'User-Agent': 'Mozilla/5.0'}
    )
    i += 1  # increment page no for next run
    uClient = uReq(my_url)
    # Check if there was a redirect
    if uClient.url != link:
        break
    page_html = uClient.read()
    uClient.close()
    # html parsing
    page_soup = soup(page_html, "html.parser")
    # grabs each name of player
    containers = page_soup.findAll("div", {"class": "top-area"})
    for container in containers:
        playerName = container.div.a.text.strip()
        print("BattlePass PlayerName: " + playerName)
        f.write(playerName + "\n")

You can add all the names to a set.
A set object is an unordered collection of distinct hashable objects. Common uses include membership testing, removing duplicates from a sequence, and computing mathematical operations such as intersection, union, difference, and symmetric difference.
my_set = set()
# Lets add some elements to a set
my_set.add('a')
my_set.add('b')
print(my_set) # prints {'a', 'b'}
# Add one more 'a'
my_set.add('a')
print(my_set) # still prints {'a', 'b'} !
In your case, let's add all the names to a set and then write to the file after the for loop.
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup

# save all the nicknames to 'CSV' file format
filename = "BattlePassNicknames.csv"
f = open(filename, "a", encoding="utf-8")
headers1 = "Member of JAZE Battle Pass 2019\n"
b = 1
if b < 2:
    f.write(headers1)
    b += 1

# start page
i = 1
names = set()
while True:
    # disable jaze guard. turn off html 'mod_security'
    link = 'https://jaze.ru/forum/topic?id=50&page=' + str(i)
    my_url = Request(
        link,
        headers={'User-Agent': 'Mozilla/5.0'}
    )
    i += 1  # increment page no for next run
    uClient = uReq(my_url)
    # Check if there was a redirect
    if uClient.url != link:
        break
    page_html = uClient.read()
    uClient.close()
    # html parsing
    page_soup = soup(page_html, "html.parser")
    # grabs each name of player
    containers = page_soup.findAll("div", {"class": "top-area"})
    for container in containers:
        playerName = container.div.a.text.strip()
        names.add(playerName)

for name in names:
    f.write(name + "\n")
f.close()
EDIT
Sets do not preserve the order. If you want to retain the order, just use lists.
...
names = []
while True:
    ...
    for container in containers:
        playerName = container.div.a.text.strip()
        if playerName not in names:
            names.append(playerName)

for name in names:
    f.write(name + "\n")
f.close()
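If you are on Python 3.7+, another option worth knowing: a plain dict remembers insertion order, so dict.fromkeys() deduplicates while keeping order and avoids the repeated "not in" scan over a list. A minimal sketch (the names here are just sample values in place of the scraped ones):

names = ["VANTY3", "VANTY3", "memories", "Waffel"]  # sample values, not real scraped data
unique_in_order = list(dict.fromkeys(names))
print(unique_in_order)  # ['VANTY3', 'memories', 'Waffel']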

Related

How to write to a csv from python

I am web scraping pacsun.com with Python and trying to put the results into a CSV file, but when I open the file only the headers print, not the product_name, price, or new_arrival.
So my question is: how do I get these values to print out under the headers in the CSV file?
from bs4 import BeautifulSoup as soup
import csv

my_url = ('https://www.pacsun.com/mens/')
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')

product_data = page_soup.findAll('div', {'class': 'product-data'})
#print(len(product_data))
#print(product_data[0])
product = product_data[0]

filename = 'pacsun.csv'
f = open(filename, 'w')
headers = 'product_name, price, new_arrival\n'
f.write(headers)

for product in product_data:
    #name = product.div.a["title"]
    product_name = print('product: ' + product.div.a["title"])
    #the code above gets the title of the product
    price = product.findAll('div', {'class': 'product-price group'})
    #the code above gets the price of the product
    new_arrival = product.findAll('div', {'class': 'new'})
    #the code above gets
    print(price[0].text)
    print(new_arrival[0].text)
    thewriter = csv.DictWriter(filename, headers)
    thewriter.writerow({'product_name': product_name, 'price': price, 'new_arrival': new_arrival})
    #f.write(product_name.replace(",", "|") + "," + price + "," + new_arrival + "\n")

f.close()
You have a problem with how the data is written, so I fixed it and it works fine. You only need to change w to a, i.e. f = open(filename, 'a'), and to put f.write inside the loop.
from bs4 import BeautifulSoup as soup
import csv
from urllib.request import urlopen as uReq

my_url = ('https://www.pacsun.com/mens/')
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')

product_data = page_soup.findAll('div', {'class': 'product-data'})
#print(len(product_data))
#print(product_data[0])
product = product_data[0]

filename = 'pacsun.csv'
f = open(filename, "a")
headers = 'product_name, price, new_arrival\n'
f.write(headers)

for product in product_data:
    #name = product.div.a["title"]
    product_name = print('product: ' + product.div.a["title"])
    #the code above gets the title of the product
    price = product.findAll('div', {'class': 'product-price group'})
    #the code above gets the price of the product
    new_arrival = product.findAll('div', {'class': 'new'})

    price_ = ''
    new_arrival_ = ''
    product_name_ = ''
    # product_name_ = ' '.join([str(elem) for elem in product.div.a["title"]])
    for price_text in price:
        price_ = price_text.text
    for new_arrival_text in new_arrival:
        new_arrival_ = new_arrival_text.text

    f.write(product.div.a["title"] + "," + price_ + "," + new_arrival_ + "\n")
f.close()

Web scraping multiple pages with BeautifulSoup

I collected all the necessary information from the first page, but I don't know how to collect information from all pages of the site. I tried to find a solution in other Stack Overflow topics but didn't understand anything. I will be very grateful if you help me with this.
my parsing site: https://jaze.ru/forum/topic?id=50&page=1
source:
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup

# my_url and cutoff mod_security
my_url = Request('http://jaze.ru/forum/topic?id=50&page=1', headers={'User-Agent': 'Mozilla/5.0'})
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each name of player
containers = page_soup.findAll("div", {"class": "top-area"})
for container in containers:
    playerName = container.div.a.text.strip()
    print("BattlePass PlayerName: " + playerName)
source 2:
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup

# start page
i = 1
while True:
    link = 'https://jaze.ru/forum/topic?id=50&page=' + str(i)
    my_url = Request(
        link,
        headers={'User-Agent': 'Mozilla/5.0'}
    )
    i += 1  # increment page no for next run
    uClient = uReq(my_url)
    # Check if there was a redirect
    if uClient.url != link:
        break
    page_html = uClient.read()
    uClient.close()
    # html parsing
    page_soup = soup(page_html, "html.parser")
    # grabs each name of player
    containers = page_soup.findAll("div", {"class": "top-area"})

    # save all info to csv file
    filename = "BattlePassNicknames.csv"
    f = open(filename, "w", encoding="utf-8")
    headers1 = "Member of JAZE Battle Pass 2019\n"
    f.write(headers1)
    for container in containers:
        playerName = container.div.a.text.strip()
        print("BattlePass PlayerName: " + playerName)
        f.write(playerName + "\n")
    f.close()
The website redirects you to a different page if the page query parameter is greater than the last available page; you can use this to keep incrementing page until you are redirected. This is applicable if you already know the topic id (50 in this case).
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup

# start page
i = 1
while True:
    link = 'https://jaze.ru/forum/topic?id=50&page=' + str(i)
    my_url = Request(
        link,
        headers={'User-Agent': 'Mozilla/5.0'}
    )
    i += 1  # increment page no for next run
    uClient = uReq(my_url)
    # Check if there was a redirect
    if uClient.url != link:
        break
    page_html = uClient.read()
    uClient.close()
    # html parsing
    page_soup = soup(page_html, "html.parser")
    # grabs each name of player
    containers = page_soup.findAll("div", {"class": "top-area"})
    for container in containers:
        playerName = container.div.a.text.strip()
        print("BattlePass PlayerName: " + playerName)
Output
BattlePass PlayerName: VANTY3
BattlePass PlayerName: VANTY3
BattlePass PlayerName: KK#キング
BattlePass PlayerName: memories
BattlePass PlayerName: Waffel
BattlePass PlayerName: CynoBap
...
BattlePass PlayerName: Switchback
If you also want to try this out with random topic ids, then you have to handle urllib.error.HTTPError somewhere in your code to deal with all the 404s etc.
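A minimal sketch of that error handling, with a made-up topic id:

from urllib.request import urlopen as uReq
from urllib.request import Request
from urllib.error import HTTPError

topic_id = 12345  # hypothetical topic id, just for illustration
link = 'https://jaze.ru/forum/topic?id=' + str(topic_id) + '&page=1'
try:
    uClient = uReq(Request(link, headers={'User-Agent': 'Mozilla/5.0'}))
except HTTPError as e:
    # e.g. a 404 for a topic that does not exist
    print('skipping topic', topic_id, '- HTTP error', e.code)
else:
    page_html = uClient.read()
    uClient.close()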

'list' object has no attribute 'timeout' and only prints first item in the table

I am trying to pull a table from a list of URLs. When I only input one URL it only prints the first items in the table, and when I add more URLs to the list I get the error message 'list' object has no attribute 'timeout'. What is the best way to get the rest of the items and add more URLs?
Below is the code I am running.
import time, random, csv, bs4, requests, io
import pandas as pd
timeDelay = random.randrange(5, 20)
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_urls = [
    "https://www.lonza.com/products-services/bio-research/electrophoresis-of-nucleic-acids-and-proteins/nucleic-acid-electrophoresis/precast-gels-for-dna-and-rna-analysis/truband-gel-anchors.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-epithelial-cells/nucleofector-kits-for-human-mammary-epithelial-cells-hmec.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-neural-cells/nucleofector-kits-for-mammalian-glial-cells.aspx",
]

uClient = uReq(my_urls)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll('tbody')

product_name_list = []
cat_no_list = []
size_list = []
price_list = []

for container in containers:
    if (len(container) > 0):
        #try:
        title_container = container.findAll('td')
        Product_name = title_container[0].text.strip()
        product_name_list.append(Product_name)

        CatNo_container = container.findAll('td')
        CatNo = CatNo_container[1].text.strip()
        cat_no_list.append(CatNo)

        #Size_container = container.findAll('div',{'class':'col-xs-2 noPadding'})
        #Size = Size_container[0].text.strip()
        #size_list.append(Size)

        Price_container = container.findAll('td')
        Price = Price_container[4].text.strip()
        price_list.append(Price)

        print('Product_name: ' + Product_name)
        print('CatNo: ' + CatNo)
        print('Size: ' + 'N/A')
        print('Price: ' + Price)
        print(" ")
        time.sleep(timeDelay)
You are passing a list here: uClient = uReq(my_urls), where a string (a single url) is required.
You need to pass the individual elements of the list, i.e. the strings.
Here is the edited code that works for multiple urls.
UPDATED CODE (to get all items):
import time, random, csv, bs4, requests, io
import pandas as pd
timeDelay = random.randrange(5, 20)
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_urls = [
    "https://www.lonza.com/products-services/bio-research/electrophoresis-of-nucleic-acids-and-proteins/nucleic-acid-electrophoresis/precast-gels-for-dna-and-rna-analysis/truband-gel-anchors.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-epithelial-cells/nucleofector-kits-for-human-mammary-epithelial-cells-hmec.aspx",
    "https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-neural-cells/nucleofector-kits-for-mammalian-glial-cells.aspx",
]

for url in my_urls:
    print("URL using: ", url)
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll('tbody')

    product_name_list = []
    cat_no_list = []
    size_list = []
    price_list = []

    for container in containers:
        if (len(container) > 0):
            #try:
            items = container.findAll('tr')
            for item in items:
                item = item.text.split('\n')
                Product_name = item[1]
                product_name_list.append(Product_name)
                CatNo = item[2]
                cat_no_list.append(CatNo)
                #Size_container = container.findAll('div',{'class':'col-xs-2 noPadding'})
                #Size = Size_container[0].text.strip()
                #size_list.append(Size)
                Price = item[6]
                price_list.append(Price)

                print('Product_name: ' + Product_name)
                print('CatNo: ' + CatNo)
                print('Size: ' + 'N/A')
                print('Price: ' + Price)
                print(" ")
                time.sleep(timeDelay)

Python Web Scraping using BS

I have a web scraping program that gets multiple pages, but I have to set the while loop to a number. I want to make a condition that stops the loop once it reaches the last page or recognizes there are no more items to scrape. Assume I don't know how many pages exist. How do I change the while loop condition to make it stop without hard-coding an arbitrary number?
import requests
from bs4 import BeautifulSoup
import csv

filename = "output.csv"
f = open(filename, 'w', newline="", encoding='utf-8')
headers = "Date, Location, Title, Price\n"
f.write(headers)

i = 0
while i < 5000:
    if i == 0:
        page_link = "https://portland.craigslist.org/search/sss?query=xbox&sort=date"
    else:
        page_link = "https://portland.craigslist.org/search/sss?s={}&query=xbox&sort=date".format(i)
    res = requests.get(page_link)
    soup = BeautifulSoup(res.text, 'html.parser')
    for container in soup.select('.result-info'):
        date = container.select('.result-date')[0].text
        try:
            location = container.select('.result-hood')[0].text
        except:
            try:
                location = container.select('.nearby')[0].text
            except:
                location = 'NULL'
        title = container.select('.result-title')[0].text
        try:
            price = container.select('.result-price')[0].text
        except:
            price = "NULL"
        print(date, location, title, price)
        f.write(date + ',' + location.replace(",", " ") + ',' + title.replace(",", " ") + ',' + price + '\n')
    i += 120
f.close()
I use while True to run an endless loop and break to exit when there is no data:
data = soup.select('.result-info')
if not data:
    print('END: no data:')
    break
I use the csv module to save the data, so I don't have to use replace(",", " ").
It will put the text in " " quotes if there is a , in the text.
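For example, a quick standalone illustration of that quoting (the row values here are just made-up sample data):

import csv, io

buf = io.StringIO()
csv.writer(buf).writerow(["Jun 1", "Portland, OR", "Xbox One, 500GB", "$150"])
print(buf.getvalue())
# Jun 1,"Portland, OR","Xbox One, 500GB",$150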
s={} can be in any place after ?, so I put it at the end to make the code more readable.
The portal gives the first page even if you use s=0, so I don't have to check i == 0.
(BTW: in my code it has the more readable name offset.)
Full code.
import requests
from bs4 import BeautifulSoup
import csv

filename = "output.csv"
f = open(filename, 'w', newline="", encoding='utf-8')
csvwriter = csv.writer(f)
csvwriter.writerow(["Date", "Location", "Title", "Price"])

offset = 0
while True:
    print('offset:', offset)
    url = "https://portland.craigslist.org/search/sss?query=xbox&sort=date&s={}".format(offset)
    response = requests.get(url)
    if response.status_code != 200:
        print('END: request status:', response.status_code)
        break
    soup = BeautifulSoup(response.text, 'html.parser')
    data = soup.select('.result-info')
    if not data:
        print('END: no data:')
        break
    for container in data:
        date = container.select('.result-date')[0].text
        try:
            location = container.select('.result-hood')[0].text
        except:
            try:
                location = container.select('.nearby')[0].text
            except:
                location = 'NULL'
        #location = location.replace(","," ")  # don't need it with `csvwriter`
        title = container.select('.result-title')[0].text
        try:
            price = container.select('.result-price')[0].text
        except:
            price = "NULL"
        #title.replace(",", " ")  # don't need it with `csvwriter`
        print(date, location, title, price)
        csvwriter.writerow([date, location, title, price])
    offset += 120
f.close()

Modifying a scraped url and changing its extension

I am new to programming and trying to download images and PDFs from a website. In the source code, the items I need are in option tags with partial urls. The site lists these items in a drop-down menu and they display in an iframe, but each item can be opened on its own page using its full url.
So far, my code finds the options, appends the partial url to the page's base address to create the full url for each option, and removes the final " / " from the .tif and .TIF urls and adds a ".pdf".
However, for the .tif and .TIF urls, I need to change "convert" to "pdf" to open them in a new page. Is there a way to do this to only the .tif.pdf and .TIF.pdf urls while leaving the others unchanged?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import os

my_url = 'http://example.com'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")

options = page_soup.findAll("select", {"id": "images"})[0].findAll("option")
values = [o.get("value") for o in options]
split_values = [i.split("|", 1)[0] for i in values]
# The option value is split to separate the url from its label
# <option value="/convert/ASRIMG/new/hop.TIF/|New Form"></option>

new_val = []
for val in split_values:
    ext = os.path.splitext(val.rstrip('/'))[-1]
    new_ext = ext
    if ext.lower() == '.tif':
        new_ext += '.pdf'
    new_val.append(val.rstrip('/').replace(ext, new_ext))

for i in range(len(new_val)):
    image_urls = ('http://example.com' + new_val[i])
My current results:
print (new_val)
/ASRIMG/good.jpg
/ASRIMG/foo/bar1.jpg
/ASRIMG/foo/bar2.jpg
/ASRIMG/foo/bar3.jpg
/convert/ASRIMG/new/hop.TIF.pdf
/convert/REG/green1.tif.pdf
/convert/REG//green2.tif.pdf
/convert/SHIP/green3.tif.pdf
/convert/SHIP/green4.tif.pdf
/convert/SHIP/green5.tif.pdf
/SKETCHIMG/001.png
/SKETCH/002.JPG
print (image_urls)
http://example.com/ASRIMG/good.jpg
http://example.com/ASRIMG/foo/bar1.jpg
http://example.com/ASRIMG/foo/bar2.jpg
http://example.com/ASRIMG/foo/bar3.jpg
http://example.com/convert/ASRIMG/new/hop.TIF.pdf
http://example.com/convert/REG/green1.tif.pdf
http://example.com/convert/REG//green2.tif.pdf
http://example.com/convert/SHIP/green3.tif.pdf
http://example.com/convert/SHIP/green4.tif.pdf
http://example.com/convert/SHIP/green5.tif.pdf
http://example.com/SKETCHIMG/001.png
http://example.com/SKETCH/002.JPG
What I need:
http://example.com/ASRIMG/good.jpg
http://example.com/ASRIMG/foo/bar1.jpg
http://example.com/ASRIMG/foo/bar2.jpg
http://example.com/ASRIMG/foo/bar3.jpg
http://example.com/pdf/ASRIMG/new/hop.TIF.pdf
http://example.com/pdf/REG/green1.tif.pdf
http://example.com/pdf/REG//green2.tif.pdf
http://example.com/pdf/SHIP/green3.tif.pdf
http://example.com/pdf/SHIP/green4.tif.pdf
http://example.com/pdf/SHIP/green5.tif.pdf
http://example.com/SKETCHIMG/001.png
http://example.com/SKETCH/002.JPG
After this step:
split_values = [i.split("|", 1)[0] for i in values]
This code handles both upper and lower tif:
import os

split_values = ['/ASRIMG/good.jpg', '/convert/ASRIMG/new/hop.TIF/', 'SKETCHIMG/001.png']
new_val = []
for val in split_values:
    ext = os.path.splitext(val.rstrip('/'))[-1]
    new_ext = ext
    if ext.lower() == '.tif':
        new_ext += '.pdf'
    new_val.append(val.rstrip('/').replace(ext, new_ext))
This strips the trailing '/' from each value in the split_values list and, for the .tif / .TIF values, appends .pdf to the extension; all other values are left unchanged.
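To also swap the leading convert segment for pdf on just those urls, one option, assuming (as in the listings above) that every .tif/.TIF value starts with /convert/, is a sketch like this:

new_val = []
for val in split_values:
    path = val.rstrip('/')
    ext = os.path.splitext(path)[-1]
    new_ext = ext
    if ext.lower() == '.tif':
        new_ext += '.pdf'
        # assumption: the convertible urls always begin with /convert/
        if path.startswith('/convert/'):
            path = '/pdf/' + path[len('/convert/'):]
    new_val.append(path.replace(ext, new_ext))

image_urls = ['http://example.com' + v for v in new_val]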
