How to output HTML content to the command prompt using Python 3

# bug: this program should output url contents but instead outputs the urls themselves
import csv
import requests
import json

with open('PluginIndex.csv', newline='') as csvfile:  # opens the plugin index file and stores it in "csvfile"
    reader = csv.DictReader(csvfile)  # reads the contents of csvfile and stores it
    for row in reader:  # for each row of the csv
        url = row["repo"]
        print(url)  # outputs the repo url of each csv row
        resp = requests.get(url)
        data = json.loads(resp.text)
        data.keys()
I need the content of each URL in a CSV file to be output to the command prompt, but the requests and json libraries alone don't seem to be enough. Does anyone know a way to achieve this?
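A minimal sketch of a fix, assuming each repo URL returns JSON (if it serves plain HTML instead, printing resp.text works the same way and the json import can be dropped):

import csv
import requests

with open('PluginIndex.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        url = row["repo"]
        resp = requests.get(url)
        # print the body of the response rather than the URL itself
        print(resp.text)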

Related

print() command does not contain all expected values

I am reading data from a .csv file and want to write parts of that data into an output file.
When I execute the program and print the results, I get the complete data set of the input file.
However, when I call print() again, only the last line of the input file is shown.
When I write the print result into another csv file, likewise only the last line is transferred.
Basically I am new at this and struggle to understand how data is stored and passed on.
import csv
import os  # needed for os.path.join below

with open("path to the input file") as file:
    reader = csv.reader(file)
    for line in file:
        input_data = line.strip().split(";")
        print(input_data)

with open(os.path.join("path to the output file"), "w") as file1:
    toFile = input_data
    file1.write(str(toFile))
There are no error messages, just not the expected result. I expect 10 lines to be transferred, but only the last one makes it to the output .csv.
Thank you for your help!
When you loop over the lines in the csv, each iteration assigns the value of that line to input_data, overwriting the value previously stored there. Since the output file is only opened after the loop has finished, only the final value gets written.
I would recommend something like the following:
import csv

with open('path to input file', 'r', newline='') as infile, open('path to output file', 'w', newline='') as outfile:
    reader = csv.reader(infile, delimiter=';')
    writer = csv.writer(outfile, delimiter=';')
    for line in reader:
        writer.writerow(line)  # write each row as soon as it is read
You can open multiple files in a single with clause, as shown in the example. The csv reader splits each line on the delimiter for you, and each row is written out inside the loop, so nothing gets overwritten.
This should do it. You created the reader object correctly but didn't use it. I hope my example improves your understanding of the reader class.
#!/usr/bin/env python
import csv

def write_csv_to_file(csv_file_name, out_file_name):
    # Open the csv-file
    with open(csv_file_name, newline='') as csvfile:
        # Create a reader object, pass it the csv-file and tell it how to
        # split up values
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        # Open the output file
        with open(out_file_name, "w") as out_file:
            # Loop through the rows that the reader found
            for row in reader:
                # Join the row values using a comma as separator
                r = ', '.join(row)
                # Print row and write to output file
                print(r)
                # In text mode a plain '\n' is translated to the platform's
                # line ending; os.linesep would give '\r\r\n' on Windows
                out_file.write(r + '\n')

if __name__ == '__main__':
    csv_file = "example.csv"
    out_file = "out.txt"
    write_csv_to_file(csv_file, out_file)

How to read a CSV file downloaded from NSE

My Code:
import csv
import requests

url = 'https://xxxxxxxxxxxxxxxxxxx.csv'
r = requests.get(url)
text = r.iter_lines()
reader = csv.reader(text, delimiter=',')
for row in reader:
    print(row)
I am getting the following error:
_csv.Error: iterator should return strings, not bytes (did you open the file in text mode?)
Just save the csv separately and then read it in with pandas:
import pandas as pd

print(pd.read_csv('SO/AN_LATEST_ANNOUNCED.csv'))
(To save the file separately just open the link in a browser.)
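The original error can also be fixed without saving the file at all: requests can decode the stream for you, so csv.reader receives strings instead of bytes. A sketch, assuming the response declares a usable text encoding:

import csv
import requests

url = 'https://xxxxxxxxxxxxxxxxxxx.csv'
r = requests.get(url)
# decode_unicode=True makes iter_lines yield str rather than bytes,
# which is what csv.reader expects
reader = csv.reader(r.iter_lines(decode_unicode=True), delimiter=',')
for row in reader:
    print(row)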

save web scraped data to a txt file

I am trying to save data that I have already scraped from the New York Times web page to a txt file.
import urllib.request
from bs4 import BeautifulSoup

# URL
html_page = 'https://www.nytimes.com/'
page = urllib.request.urlopen(html_page)
soup = BeautifulSoup(page, "html.parser")
title_box = soup.findAll("h2", class_="css-bzeb53 esl82me2")
print(title_box)

# Extract titles from list
titles = []
for occurence in title_box:
    titles.append(occurence.text.strip())
print(titles)
It works fine up to this point, but I can't manage to create/save the data to a txt file.
# Save the Headlines
filename = '/home/stephan/Documents/NYHeads.txt'
with open(filename, 'w') as file_object:
    file_object.write(titles)
The problem is that what you write to a file has to be a string, but in your program titles is a list. You need to convert titles to a string first. This should work:
filename = '/home/stephan/Documents/NYHeads.txt'
with open(filename, 'w') as file_object:
    file_object.write(str(titles))
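If the goal is one headline per line rather than the Python repr of the list, a small variant of the same fix:

filename = '/home/stephan/Documents/NYHeads.txt'
with open(filename, 'w') as file_object:
    # join the list into one newline-separated string before writing
    file_object.write('\n'.join(titles))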

Python 3 - Read CSV from URL after testing for header

I'm trying to generate records from a csv retrieved from a given url in Python 3.
Given urlopen returns a bytes-mode file object but csv.DictReader expects text-mode file objects, I've had to wrap the urlopen file object in a TextIOWrapper (with a utf-8 decoding). Now, unfortunately I'm stuck between two undesirable options:
1) TextIOWrapper doesn't support seek, so I can't reset the csv_file generator after checking for a header with Sniffer.
2) If I don't seek back to 0, I truncate the first 18 records.
How do I modify the below code so that it can both check for headers and yield all records off of one urlopen call?
What have I tried?
Reading the URL twice: once to check for headers, a second time to generate the csv records. This seems suboptimal.
Code that skips the first 18 records is below. Uncomment the csv_file.seek(0) line to generate the seek error.
import csv
import io
import urllib.request

def read_url_csv(url, fieldnames=None, transform=None):
    if transform is None:
        transform = lambda x, filename, fieldnames: x
    with urllib.request.urlopen(url) as csv_binary:
        csv_file = io.TextIOWrapper(csv_binary, "utf-8")
        has_header = csv.Sniffer().has_header(csv_file.read(1024))
        #csv_file.seek(0)
        reader = csv.DictReader(csv_file, fieldnames=fieldnames)
        if has_header and fieldnames is not None:  # Overwriting
            next(reader)
        for record in reader:
            yield transform(record, url, fieldnames)
StringIO supports seeking.
csv_file = io.StringIO(io.TextIOWrapper(csv_binary, "utf-8").read())
has_header = csv.Sniffer().has_header(csv_file.read(1024))
csv_file.seek(0)
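Put together with the original generator, the complete fix might look like the following sketch (it reads the entire response into memory, which assumes the csv fits comfortably in RAM):

import csv
import io
import urllib.request

def read_url_csv(url, fieldnames=None, transform=None):
    if transform is None:
        transform = lambda x, filename, fieldnames: x
    with urllib.request.urlopen(url) as csv_binary:
        # buffer the decoded text in a seekable in-memory file
        csv_file = io.StringIO(io.TextIOWrapper(csv_binary, "utf-8").read())
        has_header = csv.Sniffer().has_header(csv_file.read(1024))
        csv_file.seek(0)  # legal now: StringIO supports seek
        reader = csv.DictReader(csv_file, fieldnames=fieldnames)
        if has_header and fieldnames is not None:  # skip the supplied header
            next(reader)
        for record in reader:
            yield transform(record, url, fieldnames)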

Python CSV exporting 4 identical rows

I have used one of the methods described in "Python write to CSV line by line" to attempt to write all the lines of my output to a .csv. I've managed to get it to the stage of generating the CSV, but instead of showing all the lines of my data, I am seeing one line repeated 4 times and nothing else.
Can anyone see what the issue is here?
from bs4 import BeautifulSoup
import requests
import csv

headers = {'User-Agent': 'Mozilla/5.0'}
for i in range(1, 300):
    url = "xxx?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', class_='product-block__info')
    for item in items:
        product = item.find('span', class_='short_desc').text
        stock = item.find('span', class_='count_product_stock hidden').text
        brand = item.find('h4', class_='brand').text
        price = item.find('span', class_='selling_price').text
        # create a list of all the fields
        sheets = [brand, product, stock, price]
        print(sheets)

with open('csvfile.csv', 'wt') as file:
    for l in sheets:
        file.writelines(sheets)
        file.write('\n')
You probably want something more like the following untested code. The example provided can't be run as is:
from bs4 import BeautifulSoup
import requests
import csv

headers = {'User-Agent': 'Mozilla/5.0'}
# Open the file once. See the csv documentation for the correct way to open
# a file for use with csv.writer. If you plan to open the .csv with
# Excel, the utf-8-sig encoding will allow non-ASCII to work correctly.
with open('csvfile.csv', 'w', encoding='utf-8-sig', newline='') as f:
    file = csv.writer(f)  # actually use the CSV module.
    for i in range(1, 300):
        url = "xxx?page=%s" % i
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        items = soup.find_all('div', class_='product-block__info')
        for item in items:
            product = item.find('span', class_='short_desc').text
            stock = item.find('span', class_='count_product_stock hidden').text
            brand = item.find('h4', class_='brand').text
            price = item.find('span', class_='selling_price').text
            # create a list of all the fields
            sheets = [brand, product, stock, price]
            # write a single line.
            file.writerow(sheets)
Here's a tested example that will open in Excel. I threw in a non-ASCII character and a comma in the data to demonstrate the csv module's ability to handle it:
#coding:utf8
import csv

with open('csvfile.csv', 'w', encoding='utf-8-sig', newline='') as f:
    file = csv.writer(f)
    file.writerow('BRAND PRODUCT STOCK PRICE'.split())
    for i in range(1, 11):
        sheets = ['brand{}'.format(i), 'pröduct{}'.format(i), 'st,ock{}'.format(i), 'price{}'.format(i)]
        file.writerow(sheets)
Output:
BRAND,PRODUCT,STOCK,PRICE
brand1,pröduct1,"st,ock1",price1
brand2,pröduct2,"st,ock2",price2
brand3,pröduct3,"st,ock3",price3
brand4,pröduct4,"st,ock4",price4
brand5,pröduct5,"st,ock5",price5
brand6,pröduct6,"st,ock6",price6
brand7,pröduct7,"st,ock7",price7
brand8,pröduct8,"st,ock8",price8
brand9,pröduct9,"st,ock9",price9
brand10,pröduct10,"st,ock10",price10
