Complex python3 csv scraper - python-3.x

I've got the code below working great when pulling data from a row, in my case row[0]. I'm wondering how to tweak it to pull data from multiple rows?
Also, I would love to be able to specify which divTag class (see the code below) to use for a specific column.
Something like for row[1,2] use:
divTag = soup.find("div", {"class": "productsPicture"})
and for row[4,5] use:
divTag = soup.find("div", {"class": "product_content"})
If that make sense to you guys.
from bs4 import BeautifulSoup
import requests
import csv
with open('urls.csv', 'r') as csvFile, open('results.csv', 'w', newline='') as results:
reader = csv.reader(csvFile, delimiter=';')
writer = csv.writer(results)
for row in reader:
# get the url
url = row[0]
print(url)
# fetch content from server
try:
html = requests.get(url).content
except requests.exceptions.ConnectionError as e:
writer.writerow([url, '', 'bad url'])
continue
# soup fetched content
soup = BeautifulSoup(html, 'html.parser')
divTag = soup.find("div", {"class": "productsPicture"})
if divTag:
# Return all 'a' tags that contain an href
for a in divTag.find_all("a", href=True):
url_sub = a['href']
# Test that link is valid
try:
r = requests.get(url_sub)
writer.writerow([url, url_sub, 'ok'])
except requests.exceptions.ConnectionError as e:
writer.writerow([url, url_sub, 'bad link'])
else:
writer.writerow([url, '', 'no results'])
urls.csv sample:
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E705Y-0193;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E703Y-0193;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E702Y-4589;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E706Y-9093;
Example classes to search for:

To add a per column find parameter, you could create a dictionary mapping the index number into the required find parameters as follows:
from bs4 import BeautifulSoup
import requests
import csv
class_1 = {"class": "productsPicture"}
class_2 = {"class": "product_content"}
class_3 = {"class": "id-fix"}
# map a column number to the required find parameters
class_to_find = {
0 : class_3, # Not defined in question
1 : class_1,
2 : class_1,
3 : class_3, # Not defined in question
4 : class_2,
5 : class_2}
with open('urls.csv', 'r') as csvFile, open('results.csv', 'w', newline='') as results:
reader = csv.reader(csvFile)
writer = csv.writer(results)
for row in reader:
# get the url
output_row = []
for index, url in enumerate(row):
url = url.strip()
# Skip any empty URLs
if len(url):
#print('col: {}\nurl: {}\nclass: {}\n\n'.format(index, url, class_to_find[index]))
# fetch content from server
try:
html = requests.get(url).content
except requests.exceptions.ConnectionError as e:
output_row.extend([url, '', 'bad url'])
continue
except requests.exceptions.MissingSchema as e:
output_row.extend([url, '', 'missing http...'])
continue
# soup fetched content
soup = BeautifulSoup(html, 'html.parser')
divTag = soup.find("div", class_to_find[index])
if divTag:
# Return all 'a' tags that contain an href
for a in divTag.find_all("a", href=True):
url_sub = a['href']
# Test that link is valid
try:
r = requests.get(url_sub)
output_row.extend([url, url_sub, 'ok'])
except requests.exceptions.ConnectionError as e:
output_row.extend([url, url_sub, 'bad link'])
else:
output_row.extend([url, '', 'no results'])
writer.writerow(output_row)
The enumerate() function is used to return a counter whist iterating over a list. So index will be 0 for the first URL, and 1 for the next. This can then be used with the class_to_find dictionary to get the required parameters to search on.
Each URL results in 3 columns being created, the url, the sub-url if successful and the result. These can be removed if not needed.

Related

Function for these parse elements will do not repeat. BeautifulSoup

Which function(or etc) is ideal so that these nicknames do not repeat on my parser. Dont know how to do that. I'l be very grateful if you help me.
Source:
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup
# save all the nicknames to 'CSV' file format
filename = "BattlePassNicknames.csv"
f = open(filename, "a", encoding="utf-8")
headers1 = "Member of JAZE Battle Pass 2019\n"
b = 1
if b < 2:
f.write(headers1)
b += 1
# start page
i = 1
while True:
# disable jaze guard. turn off html 'mod_security'
link = 'https://jaze.ru/forum/topic?id=50&page='+str(i)
my_url = Request(
link,
headers={'User-Agent': 'Mozilla/5.0'}
)
i += 1 # increment page no for next run
uClient = uReq(my_url)
if uClient.url != link:
break
page_html = uClient.read()
# Check if there was a redirect
uClient.close()
# html parsing
page_soup = soup(page_html, "html.parser")
# grabs each name of player
containers = page_soup.findAll("div", {"class": "top-area"})
for container in containers:
playerName = container.div.a.text.strip()
print("BattlePass PlayerName: " + playerName)
f.write(playerName + "\n")
You can add all the names to a set.
A set object is an unordered collection of distinct hashable objects. Common uses include membership testing, removing duplicates from a sequence, and computing mathematical operations such as intersection, union, difference, and symmetric difference.
my_set = set()
# Lets add some elements to a set
my_set.add('a')
my_set.add('b')
print(my_set) # prints {'a', 'b'}
# Add one more 'a'
my_set.add('a')
print(my_set) # still prints {'a', 'b'} !
In your case, let's add all the names to a set and then write to the file after the for loop.
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup
# save all the nicknames to 'CSV' file format
filename = "BattlePassNicknames.csv"
f = open(filename, "a", encoding="utf-8")
headers1 = "Member of JAZE Battle Pass 2019\n"
b = 1
if b < 2:
f.write(headers1)
b += 1
# start page
i = 1
names = set()
while True:
# disable jaze guard. turn off html 'mod_security'
link = 'https://jaze.ru/forum/topic?id=50&page='+str(i)
my_url = Request(
link,
headers={'User-Agent': 'Mozilla/5.0'}
)
i += 1 # increment page no for next run
uClient = uReq(my_url)
if uClient.url != link:
break
page_html = uClient.read()
# Check if there was a redirect
uClient.close()
# html parsing
page_soup = soup(page_html, "html.parser")
# grabs each name of player
containers = page_soup.findAll("div", {"class": "top-area"})
for container in containers:
playerName = container.div.a.text.strip()
names.add(playerName)
for name in names:
f.write(name + "\n")
f.close()
EDIT
Sets do not preserve the order. If you want to retain the order, just use lists.
...
names = []
while True:
...
for container in containers:
playerName = container.div.a.text.strip()
if playerName not in names:
names.append(playerName)
for name in names:
f.write(name + "\n")
f.close()

How to add exception to skip AttributeError in my code?

I was trying to skip AttributeError in my code because there is always atrribute error while crawling, especially when trying to get the value of title, and content. I have tried to put "except AttributeError" somewhere but never work. Could anybody help me? I am using python 3.6
from bs4 import BeautifulSoup
import requests
import pymysql.cursors
urls2 = []
result = requests.get("http://desaku.bandungkab.go.id/desaonline/")
src = result.content
soup = BeautifulSoup(src, 'lxml')
links = soup.find_all('a')
urls = []
for link in links:
if "www" in link.text:
url = link.attrs['href']
urls.append(url)
num1=len(urls)
b=0
while b<10:
result2 = requests.get(urls[b])
src2 = result2.content
soup = BeautifulSoup(src2, 'lxml')
links2 = soup.find_all('a')
for link in links2:
if "selengkapnya" in link.text:
url2 = link.attrs['href']
urls2.append(url2)
b+=1
num=len(urls2)
i=0
while i<num:
html = requests.get(urls2[i])
src = html.content
soup = BeautifulSoup(src, 'lxml')
recordList = soup.findAll("div", {"class": "artikel", })
recordlist = soup.find_all('div', attrs={'class':'sampul2'})
connection = pymysql.connect(host='localhost',
user='root',
password='',
db='bs4-test',
charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor)
try:
with connection.cursor() as cursor:
for record in recordList:
#except AttributeError:
#continue #WHERE TO PUT THIS EXCEPTION,TO SKIP ATRRIBUTEERRROR?
title = record.find("h2", {"class": "judul", }).get_text().strip()
date = record.find('i').next_sibling.next_sibling.next_sibling.replace('\t\t\t\t\t\t\t', '')
content = record.find("div", {"class":"teks"}).get_text().strip()
image = record.img['src']
cover = record.img['src']
sql = "INSERT INTO `artikel` (`jdl`, `tgl`, `kon`, `gambar`, `sampul`) VALUES (%s, %s, %s, %s, %s)"
cursor.execute(sql, (title, date, content, image, cover))
connection.commit()
print ("Record inserted successfully into table")
finally:
connection.close()
print("MySQL connection is closed")
i+=1
Just an example: if it is an url then append the url, if not then append null. Usually you would want the length of all lists to be the same, so that finally you could put them in a dataframe.
import numpy as np
links = soup.find_all('a')
urls = []
for link in links:
try:
url = link['href']
urls.append(url)
except:
urls.append(np.nan)

How Can I Loop Through URLs and import TD elements from several Links

I am trying to import data from the following URLs, and write each data set to a CSV file.
Here are a few sample URls that I want to grab fundamental data from:
https://finviz.com/quote.ashx?t=sbuc
https://finviz.com/quote.ashx?t=msft
https://finviz.com/quote.ashx?t=aapl
How can I import the data from 'Index' to 'Change'?
I think the script should, basically, look like this.
import csv
import urllib.request
from bs4 import BeautifulSoup
soup = BeautifulSoup("html.parser")
url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']
for stocks in tckr:
url_list = [url_base + tckr]
with open('C:/Users/Excel/Desktop/today.csv', 'a', newline='') as file:
writer = csv.writer(file)
for url in url_list:
try:
fpage = urllib.request.urlopen(url)
fsoup = BeautifulSoup(fpage, 'html.parser')
# write header row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2-cp'})))
# write body row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2'})))
except urllib.error.HTTPError:
print("{} - not found".format(url))
Except, when I run it, I get this error message: SyntaxError: EOL while scanning string literal
import csv
import requests
from bs4 import BeautifulSoup
url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']
url_list = [url_base + s for s in tckr]
with open('../Python/SOtest.csv', 'a', newline='') as f:
writer = csv.writer(f)
for url in url_list:
try:
fpage = requests.get(url)
fsoup = BeautifulSoup(fpage.content, 'html.parser')
# write header row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2-cp'})))
# write body row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2'})))
except HTTPError:
print("{} - not found".format(url))
I use requests so there is that difference. But it works so you can pull code from there if need be.

Python Web Scraping using BS

I have a web scraping program that gets multiple pages, but I have to set the while loop to a number. I want to make a condition that stops the loop once it reaches the last page or recognizes there are no more items to scrape. Assume I don't know how many pages exist. How do I change the while loop condition to make it stop without putting a random number?
import requests
from bs4 import BeautifulSoup
import csv
filename="output.csv"
f=open(filename, 'w', newline="",encoding='utf-8')
headers="Date, Location, Title, Price\n"
f.write(headers)
i=0
while i<5000:
if i==0:
page_link="https://portland.craigslist.org/search/sss?query=xbox&sort=date"
else:
page_link="https://portland.craigslist.org/search/sss?s={}&query=xbox&sort=date".format(i)
res=requests.get(page_link)
soup=BeautifulSoup(res.text,'html.parser')
for container in soup.select('.result-info'):
date=container.select('.result-date')[0].text
try:
location=container.select('.result-hood')[0].text
except:
try:
location=container.select('.nearby')[0].text
except:
location='NULL'
title=container.select('.result-title')[0].text
try:
price=container.select('.result-price')[0].text
except:
price="NULL"
print(date,location,title,price)
f.write(date+','+location.replace(","," ")+','+title.replace(","," ")+','+price+'\n')
i+=120
f.close()
I use while True to run endless loop and break to exit when there is no data
data = soup.select('.result-info')
if not data:
print('END: no data:')
break
I use module csv to save data so I don't have to use replace(","," ").
It will put text in " " if there is , in text.
s={} can be in any place after ? so I put it at the end to make code more readable.
Portal gives first page even if you use s=0 so I don't have to check i == 0
(BTW: in my code it has more readable name offset)
Full code.
import requests
from bs4 import BeautifulSoup
import csv
filename = "output.csv"
f = open(filename, 'w', newline="", encoding='utf-8')
csvwriter = csv.writer(f)
csvwriter.writerow( ["Date", "Location", "Title", "Price"] )
offset = 0
while True:
print('offset:', offset)
url = "https://portland.craigslist.org/search/sss?query=xbox&sort=date&s={}".format(offset)
response = requests.get(url)
if response.status_code != 200:
print('END: request status:', response.status)
break
soup = BeautifulSoup(response.text, 'html.parser')
data = soup.select('.result-info')
if not data:
print('END: no data:')
break
for container in data:
date = container.select('.result-date')[0].text
try:
location = container.select('.result-hood')[0].text
except:
try:
location = container.select('.nearby')[0].text
except:
location = 'NULL'
#location = location.replace(","," ") # don't need it with `csvwriter`
title = container.select('.result-title')[0].text
try:
price = container.select('.result-price')[0].text
except:
price = "NULL"
#title.replace(",", " ") # don't need it with `csvwriter`
print(date, location, title, price)
csvwriter.writerow( [date, location, title, price] )
offset += 120
f.close()

Python Web Scrape Unknown Number of Pages

I have working code that scrapes a single Craigslist page for specific information, but what would I need to add in order to grab the data from ALL of the pages (not knowing how many pages ahead of time)?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url="https://portland.craigslist.org/search/sss?query=electronics&sort=date"
uClient=uReq(my_url) #sends GET request to URL
page_html=uClient.read() #reads returned data and puts it in a variable
uClient.close() #close the connection
#create a file that we will want later to write parsed data to
filename="ScrapedData.csv"
f=open(filename, 'w')
headers="date, location, title, price\n"
f.write(headers)
#use BS to parse the webpage
page_soup=soup(page_html,'html.parser') #applying BS to the obtained html
containers=page_soup.findAll('p',{'class','result-info'})
for container in containers:
container_date=container.findAll('time',{'class','result-date'})
date=container_date[0].text
try:
container_location=container.findAll('span',{'class','result-hood'})
location=container_location[0].text
except:
try:
container_location=container.findAll('span',{'class','nearby'})
location=container_location[0].text
except:
location='NULL'
container_title=container.findAll('a',{'class','result-title'})
title=container_title[0].text
try:
container_price=container.findAll('span',{'class','result-price'})
price=container_price[0].text
except:
price='NULL'
#to print to screen
print('date:'+date)
print('location:'+location)
print('title:'+title)
print('price:'+price)
#to write to csv
f.write(date+','+location.replace(",","-")+','+title.replace(","," ")+','+price+'\n')
f.close()
Apart from what sir Andersson has already shown, you can do that as well for this site:
import requests
from bs4 import BeautifulSoup
import csv
page_link = "https://portland.craigslist.org/search/sss?s={}&query=electronics&sort=date"
for link in [page_link.format(page) for page in range(0,1147,120)]: #this is the fix
res = requests.get(link)
soup = BeautifulSoup(res.text,'lxml')
for container in soup.select('.result-info'):
try:
date = container.select('.result-date')[0].text
except IndexError:
date = ""
try:
title = container.select('.result-title')[0].text
except IndexError:
title = ""
try:
price = container.select('.result-price')[0].text
except IndexError:
price = ""
print(date,title,price)
with open("craigs_item.csv","a",newline="",encoding="utf-8") as outfile:
writer = csv.writer(outfile)
writer.writerow([date,title,price])
You can try to loop through all pages by handling "s" parameter in URL until you find page with no results (page with text "search and you will find"):
import requests
results_counter = 0
while True:
my_url="https://portland.craigslist.org/search/sss?query=electronics&sort=date&s=%d" % results_counter
page_html = requests.get(my_url).text
if "search and you will find" in page_html:
break
else:
results_counter += 120
filename="ScrapedData.csv"
f=open(filename, 'w')
headers="date, location, title, price\n"
f.write(headers)
page_soup=soup(page_html,'html.parser') #applying BS to the obtained html
containers=page_soup.findAll('p',{'class','result-info'})
...

Resources