How to download and iterate over a CSV file - python-3.x

I'm trying to download a CSV file and iterate over its rows, but I only get the header row and no lines after it.
I tried using this answer, but with no luck.
this is my code:
from datetime import datetime
import requests
import csv
def main():
    """Fetch today's CBOE BZX volume report and print each parsed CSV row."""
    print("python main function")
    datetime_object = datetime.now().date()
    url = f'https://markets.cboe.com/us/equities/market_statistics/volume_reports/day/{datetime_object}/csv/?mkt=bzx'
    print(url)

    # The whole response body is read and decoded before it is parsed.
    response = requests.get(url, stream=True)
    csv_content = response.content.decode('utf-8')
    print(csv_content)

    # Fields are split on '~'; a line that contains no '~' comes back as a
    # single-column row.
    cr = csv.reader(csv_content.splitlines(), delimiter='~')
    for row in cr:
        print(row)


if __name__ == '__main__':
    main()

cr = csv.reader(csv_content.splitlines(), delimiter='~')
change to
cr = csv.reader(csv_content.splitlines(), delimiter=',')
Also check whether you are downloading the full file or a file that contains only the header: open the URL in a browser to see ;)

Related

CSV writer writes set to a single row rather than multiple rows

I am working on a web scraper for class. I basically have to compile all of the http links from a website and write them to a CSV file. They also need to be de-duplicated, which is why I'm using a set. I have all the parts complete, except that when it writes to the CSV, the entire set of links is written to a single row rather than one link per row. Can someone review my code and tell me what I'm missing? I cannot find a solution anywhere.
My code is below:
from bs4 import BeautifulSoup
import requests
import csv
import urllib.parse
base_url = 'https://www.census.gov'
l = set()

r = requests.get("https://www.census.gov/programs-surveys/popest.html")
c = r.content
soup = BeautifulSoup(c, 'html.parser')

file = open('c996webscraper_writer.csv', 'w', newline="")

# Resolve each anchor's href against the site root; the set drops duplicates.
for link in soup.findAll('a'):
    l.add(urllib.parse.urljoin(base_url, link.get('href')))

with file:
    write = csv.writer(file, delimiter = ',', lineterminator = '\r')
    write.writerow(['List of Links'])
    # writerows() receives a one-element list whose only element is the set,
    # so the whole set is emitted as a single row.
    write.writerows([l])
file.close()
This is a printout of what's happening:
CSV Image
from bs4 import BeautifulSoup
import requests
import csv
import urllib.parse
base_url = 'https://www.census.gov'

r = requests.get("https://www.census.gov/programs-surveys/popest.html")
soup = BeautifulSoup(r.content, 'html.parser')

# Resolve every href against the site root. dict.fromkeys() removes repeated
# URLs (the links must be de-duplicated) while keeping the order in which they
# appear on the page; a plain list would keep the duplicates.
l = list(dict.fromkeys(
    urllib.parse.urljoin(base_url, link.get('href'))
    for link in soup.findAll('a')
))

# One writerow() call per link puts each URL on its own CSV row. The with
# block closes the file, so no explicit close() is needed afterwards.
with open('c996webscraper_writer.csv', 'w', newline="") as file:
    write = csv.writer(file)
    write.writerow(['List of Links'])
    for x in l:
        write.writerow([x])

How Can I Loop Through URLs and import TD elements from several Links

I am trying to import data from the following URLs and write each data set to a CSV file.
Here are a few sample URLs that I want to grab fundamental data from:
https://finviz.com/quote.ashx?t=sbuc
https://finviz.com/quote.ashx?t=msft
https://finviz.com/quote.ashx?t=aapl
How can I import the data from 'Index' to 'Change'?
I think the script should, basically, look like this.
import csv
import urllib.request
from bs4 import BeautifulSoup
# NOTE: the first positional argument is the markup to parse, so this parses
# the literal text "html.parser"; the result is not used below.
soup = BeautifulSoup("html.parser")
url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']

for stocks in tckr:
    # NOTE: this adds the whole ticker list, not the loop variable, to the base URL.
    url_list = [url_base + tckr]

with open('C:/Users/Excel/Desktop/today.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    for url in url_list:
        try:
            fpage = urllib.request.urlopen(url)
            fsoup = BeautifulSoup(fpage, 'html.parser')
            # write header row
            writer.writerow(
                cell.text for cell in fsoup.find_all('td', {'class':'snapshot-td2-cp'})
            )
            # write body row
            writer.writerow(
                cell.text for cell in fsoup.find_all('td', {'class':'snapshot-td2'})
            )
        except urllib.error.HTTPError:
            print("{} - not found".format(url))
Except, when I run it, I get this error message: SyntaxError: EOL while scanning string literal
import csv
import requests
from bs4 import BeautifulSoup
url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']
# One quote-page URL per ticker symbol.
url_list = [url_base + s for s in tckr]

with open('../Python/SOtest.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    for url in url_list:
        try:
            fpage = requests.get(url)
            # requests does not raise for 4xx/5xx responses on its own;
            # raise_for_status() turns them into requests.exceptions.HTTPError.
            fpage.raise_for_status()
        except requests.exceptions.HTTPError:
            print("{} - not found".format(url))
            continue

        fsoup = BeautifulSoup(fpage.content, 'html.parser')
        # write header row
        writer.writerow(
            e.text for e in fsoup.find_all('td', {'class':'snapshot-td2-cp'})
        )
        # write body row
        writer.writerow(
            e.text for e in fsoup.find_all('td', {'class':'snapshot-td2'})
        )
I use requests so there is that difference. But it works so you can pull code from there if need be.

Python Web Scraping using BS

I have a web scraping program that gets multiple pages, but I have to set the while loop to a number. I want to make a condition that stops the loop once it reaches the last page or recognizes there are no more items to scrape. Assume I don't know how many pages exist. How do I change the while loop condition to make it stop without putting a random number?
import requests
from bs4 import BeautifulSoup
import csv
filename = "output.csv"
f = open(filename, 'w', newline="", encoding='utf-8')
headers = "Date, Location, Title, Price\n"
f.write(headers)

i = 0
while i < 5000:
    # The first request carries no offset; later ones pass it as the s= parameter.
    if i == 0:
        page_link = "https://portland.craigslist.org/search/sss?query=xbox&sort=date"
    else:
        page_link = "https://portland.craigslist.org/search/sss?s={}&query=xbox&sort=date".format(i)

    res = requests.get(page_link)
    soup = BeautifulSoup(res.text, 'html.parser')

    for container in soup.select('.result-info'):
        date = container.select('.result-date')[0].text

        # Try the '.result-hood' element first, then '.nearby', then a placeholder.
        try:
            location = container.select('.result-hood')[0].text
        except:
            try:
                location = container.select('.nearby')[0].text
            except:
                location = 'NULL'

        title = container.select('.result-title')[0].text

        try:
            price = container.select('.result-price')[0].text
        except:
            price = "NULL"

        print(date,location,title,price)
        # Commas inside a field are replaced so they cannot split the column.
        fields = [date, location.replace(","," "), title.replace(","," "), price]
        f.write(','.join(fields) + '\n')

    i += 120

f.close()
I use `while True` to run an endless loop and `break` to exit it when there is no more data:
data = soup.select('.result-info')
if not data:
print('END: no data:')
break
I use the `csv` module to save the data, so I don't have to use replace(","," ").
It puts the text in double quotes if the text contains a comma.
The `s={}` parameter can be in any place after the `?`, so I put it at the end to make the code more readable.
The site returns the first page even if you use `s=0`, so I don't have to check `i == 0`.
(BTW: in my code it has the more readable name `offset`.)
Full code.
import requests
from bs4 import BeautifulSoup
import csv
filename = "output.csv"

# The with block closes the file even if a request or a parse step raises.
with open(filename, 'w', newline="", encoding='utf-8') as f:
    # csv.writer quotes fields that contain commas, so the text does not have
    # to be cleaned with replace(",", " ") before it is written.
    csvwriter = csv.writer(f)
    csvwriter.writerow(["Date", "Location", "Title", "Price"])

    offset = 0

    while True:
        print('offset:', offset)

        url = "https://portland.craigslist.org/search/sss?query=xbox&sort=date&s={}".format(offset)

        response = requests.get(url)
        if response.status_code != 200:
            # The response object exposes the HTTP code as `status_code`.
            print('END: request status:', response.status_code)
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        data = soup.select('.result-info')
        if not data:
            # An empty result list ends the loop.
            print('END: no data:')
            break

        for container in data:
            date = container.select('.result-date')[0].text

            # select() returns an empty list when the element is missing, so
            # indexing it raises IndexError; only that case uses the fallback.
            try:
                location = container.select('.result-hood')[0].text
            except IndexError:
                try:
                    location = container.select('.nearby')[0].text
                except IndexError:
                    location = 'NULL'

            title = container.select('.result-title')[0].text

            try:
                price = container.select('.result-price')[0].text
            except IndexError:
                price = "NULL"

            print(date, location, title, price)
            csvwriter.writerow([date, location, title, price])

        offset += 120

Python Web Scrape Unknown Number of Pages

I have working code that scrapes a single Craigslist page for specific information, but what would I need to add in order to grab the data from ALL of the pages (not knowing how many pages ahead of time)?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = "https://portland.craigslist.org/search/sss?query=electronics&sort=date"

uClient = uReq(my_url)       # sends GET request to URL
page_html = uClient.read()   # reads returned data and puts it in a variable
uClient.close()              # close the connection

# create the file that the parsed data is written to
filename = "ScrapedData.csv"
f = open(filename, 'w')
headers = "date, location, title, price\n"
f.write(headers)

# use BS to parse the webpage
page_soup = soup(page_html, 'html.parser')
containers = page_soup.findAll('p', {'class','result-info'})

for container in containers:
    date = container.findAll('time', {'class','result-date'})[0].text

    # Try the 'result-hood' span first, then 'nearby', then a placeholder.
    try:
        location = container.findAll('span', {'class','result-hood'})[0].text
    except:
        try:
            location = container.findAll('span', {'class','nearby'})[0].text
        except:
            location = 'NULL'

    title = container.findAll('a', {'class','result-title'})[0].text

    try:
        price = container.findAll('span', {'class','result-price'})[0].text
    except:
        price = 'NULL'

    # to print to screen
    print('date:'+date)
    print('location:'+location)
    print('title:'+title)
    print('price:'+price)

    # to write to csv
    row = [date, location.replace(",","-"), title.replace(","," "), price]
    f.write(','.join(row) + '\n')

f.close()
Apart from what sir Andersson has already shown, you can do that as well for this site:
import requests
from bs4 import BeautifulSoup
import csv
page_link = "https://portland.craigslist.org/search/sss?s={}&query=electronics&sort=date"

# Open the output once for the whole run instead of re-opening it for every
# row; append mode is kept, so rows from earlier runs are preserved.
with open("craigs_item.csv","a",newline="",encoding="utf-8") as outfile:
    writer = csv.writer(outfile)

    # Stepping the s= offset through the range is what walks the result pages.
    # NOTE: 1147 is a hard-coded upper bound; offsets beyond it are never
    # requested.
    for page in range(0,1147,120):
        res = requests.get(page_link.format(page))
        soup = BeautifulSoup(res.text,'lxml')

        for container in soup.select('.result-info'):
            # select() returns an empty list when the element is missing, so
            # [0] raises IndexError and the field falls back to "".
            try:
                date = container.select('.result-date')[0].text
            except IndexError:
                date = ""
            try:
                title = container.select('.result-title')[0].text
            except IndexError:
                title = ""
            try:
                price = container.select('.result-price')[0].text
            except IndexError:
                price = ""

            print(date,title,price)
            writer.writerow([date,title,price])
You can try to loop through all pages by handling "s" parameter in URL until you find page with no results (page with text "search and you will find"):
import requests
results_counter = 0

# Create the CSV and write its header once, before paging starts. Opening the
# file with 'w' inside the loop would truncate it on every page.
filename="ScrapedData.csv"
with open(filename, 'w') as f:
    headers="date, location, title, price\n"
    f.write(headers)

    while True:
        my_url="https://portland.craigslist.org/search/sss?query=electronics&sort=date&s=%d" % results_counter
        page_html = requests.get(my_url).text

        # This text marks the page as having no results, which ends the paging.
        if "search and you will find" in page_html:
            break
        results_counter += 120

        # `soup` is the question's alias: from bs4 import BeautifulSoup as soup
        page_soup=soup(page_html,'html.parser') #applying BS to the obtained html
        containers=page_soup.findAll('p',{'class','result-info'})
        # Placeholder: the per-container parsing and f.write() from the
        # question's code go here.
        ...

segmentation fault in python3

I am running python3 on a Ubuntu machine and have noticed that the following block of code is fickle. Sometimes it runs just fine, other times it produces a segmentation fault. I don't understand why. Can someone explain what might be going on?
Basically what the code does is try to read S&P companies from Wikipedia and write the list of tickers to a file in the same directory as the script. If no connection to Wikipedia can be established, the script tries instead to read an existing list from file.
from urllib import request
from urllib.error import URLError
from bs4 import BeautifulSoup
import os
import pickle
import dateutil.relativedelta as dr
import sys
# NOTE(review): the Python docs warn that a too-high recursion limit can crash
# the interpreter instead of raising RecursionError. Presumably related to the
# intermittent segmentation fault described above — confirm before keeping
# this value.
sys.setrecursionlimit(100000)
def get_standard_and_poors_500_constituents():
    """Return the link texts scraped from Wikipedia's S&P 500 company table.

    The list is downloaded from Wikipedia and cached as
    ``sp500_constituents.pkl`` next to this script. If the download fails
    with ``URLError``, the cached list is loaded instead.

    Returns:
        list[str]: text of each "external text" link in the first table on
        the page, skipping links whose content is exactly "reports".

    Raises:
        OSError: if the download fails and no cache file can be read.
        IndexError: if the page contains no table.
    """
    fname = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), "sp500_constituents.pkl"
    )

    try:
        # URL request, URL opener, read content (bytes decoded as UTF-8).
        req = request.Request(
            "http://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
        )
        with request.urlopen(req) as opener:
            content = opener.read().decode()
    except URLError:
        # No connection: fall back to the list saved by an earlier run.
        with open(fname, "rb") as f:
            return pickle.load(f)

    soup = BeautifulSoup(content, "lxml")
    # HTML table we actually need is the first.
    tables = soup.find_all("table")
    external_class = tables[0].findAll("a", {"class": "external text"})

    # get_text() yields plain str values, so the pickle stores only text.
    # NOTE(review): `ext.string` is a bs4 object; pickling those presumably
    # drags the linked parse tree along, which would explain the need for a
    # huge recursion limit — confirm.
    c = [ext.get_text() for ext in external_class if "reports" not in ext]

    with open(fname, "wb") as f:
        pickle.dump(c, f)
    # Returning here rather than from a `finally` clause lets unexpected
    # errors propagate instead of being replaced or silently discarded.
    return c
# Evaluated when the module is loaded, so loading it already triggers the
# download (or the cache read).
sp500_constituents = get_standard_and_poors_500_constituents()
# Ticker strings; presumably the SPDR S&P 500 ETF and the S&P 500 index
# symbol — verify against whatever consumes them.
spdr_etf = "SPY"
sp500_index = "^GSPC"
def main():
    """Fetch the constituent list again and print it."""
    print(get_standard_and_poors_500_constituents())


if __name__ == "__main__":
    main()

Resources