import requests
from bs4 import BeautifulSoup
from lxml import etree
import csv
with open('1_colonia.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=';')
    next(reader)  # skip the header row
    for row in reader:
        url = row[0]
        page = requests.get(url)
        # parse the html with BeautifulSoup
        soup = BeautifulSoup(page.content, 'html.parser')
        # build an lxml tree so XPath queries can be run against it
        dom = etree.HTML(str(soup))
        property = dom.xpath('//*[@id="header"]/div/div[2]/h1')
        duration = dom.xpath('//*[@id="header"]/div/p')
        price = dom.xpath('//*[@id="price"]/div/div/span/span[3]')
        # save the data to a CSV file, adding the url as a column
        with open('2_colonia.csv', 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile, delimiter=';')
            writer.writerow([url, property[0].text, duration[0].text, price[0].text])
'1_colonia.csv' contains a list of 815 links to properties on sale.
The script works until this message appears:
Traceback (most recent call last):
File "/home/flimflam/Python/colonia/2_colonia.py", line 23, in <module>
writer.writerow([url, property[0].text, duration[0].text, price[0].text])
IndexError: list index out of range
I am not sure where the problem lies. Can anyone help me out, please?
Thanks,
xpath returns lists (for the kind of expression you are using), so in your script property, duration and price are lists.
Depending on what you're searching for, xpath can return 0, 1 or multiple elements.
So you must check whether there are results in the list before accessing them. If a list is empty and you try to access its first element (as in property[0], for instance), you will get an exception.
A simple way of checking whether there's data in your lists before writing to the CSV file would be:
with open('2_colonia.csv', 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=';')
    # check that the lists are not empty
    if len(property) > 0 and len(duration) > 0 and len(price) > 0:
        writer.writerow([url, property[0].text, duration[0].text, price[0].text])
    else:
        writer.writerow([url, 'error'])
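A slightly more compact variant is sketched below; first_text is a hypothetical helper (not from any library), and unlike the version above it writes a per-field placeholder instead of a single 'error' column:

def first_text(nodes, default='error'):
    # return the text of the first node, or a default if the xpath list is empty
    return nodes[0].text if nodes else default

with open('2_colonia.csv', 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=';')
    writer.writerow([url, first_text(property), first_text(duration), first_text(price)])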
I am analyzing XML-structured text files about insider dealings. I wrote some code to parse the XML structure and write my output to a CSV file. The results for each file are written on one line, with the extracted information in individual columns. But in some files a piece of information is present multiple times, and my code overwrites the value in the cell, so in the end only one date is left in the cell of my CSV file.
import csv
import glob
import re
import string
import time
import bs4 as bs
# User defined directory for files to be parsed
TARGET_FILES = r'D:\files\*'  # a raw string cannot end with a backslash, and glob needs a wildcard pattern
# User defined file pointer to LM dictionary
# User defined output file
OUTPUT_FILE = r'D:\ouput\Parser.csv'
# Setup output
OUTPUT_FIELDS = ['Datei', 'transactionDate', 'transactionsCode', 'Director', 'Officer', 'Titel', '10-% Eigner', 'sonstiges', 'SignatureDate']
def main():
    f_out = open(OUTPUT_FILE, 'w')
    wr = csv.writer(f_out, lineterminator='\n', delimiter=';')
    wr.writerow(OUTPUT_FIELDS)
    file_list = glob.glob(TARGET_FILES)
    for file in file_list:
        print(file)
        with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
            soup = bs.BeautifulSoup(f_in, 'xml')
            output_data = get_data(soup)
            output_data[0] = file
            wr.writerow(output_data)
def get_data(soup):
    # overwrites the transactionDate if more than one transaction is disclosed on the current form
    # the index determines the column for the output
    _odata = [0] * 9
    try:
        for item in soup.find_all('transactionDate'):
            _odata[1] = item.find('value').text
    except AttributeError:
        _odata[1] = 'keine Angabe'
    try:
        for item in soup.find_all('transactionAcquiredDisposedCode'):
            _odata[2] = item.find('value').text
    except AttributeError:
        _odata[2] = 'ka'
    for item in soup.find_all('reportingOwnerRelationship'):
        try:
            _odata[3] = item.find('isDirector').text
        except AttributeError:
            _odata[3] = 'ka'
        try:
            _odata[4] = item.find('isOfficer').text
        except AttributeError:
            _odata[4] = 'ka'
        try:
            _odata[5] = item.find('officerTitle').text
        except AttributeError:
            _odata[5] = 'ka'
        try:
            _odata[6] = item.find('isTenPercentOwner').text
        except AttributeError:
            _odata[6] = 'ka'
        try:
            _odata[7] = item.find('isOther').text
        except AttributeError:
            _odata[7] = 'ka'
    try:
        for item in soup.find_all('ownerSignature'):
            _odata[8] = item.find('signatureDate').text
    except AttributeError:
        _odata[8] = 'ka'
    return _odata
if __name__ == '__main__':
    print('\n' + time.strftime('%c') + '\nGeneric_Parser.py\n')
    main()
    print('\n' + time.strftime('%c') + '\nNormal termination.')
Actually the code works, but it overwrites the column if, for example, more than one transaction date is given in a file. So I need code that automatically uses the next column for each additional transaction date. How could this work?
I would be glad if someone has a solution for my problem. Thanks a lot!
Your issue is that you are iterating over the result of soup.find_all() and writing to the same element of _odata every time. You need to do something with _odata in each iteration, otherwise you will only end up with whatever was written to it the last time.
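For instance, a minimal sketch, assuming the same tag names as in your snippet (adjust to your actual XML), that collects every transactionDate instead of overwriting one cell:

def get_transaction_dates(soup):
    # collect ALL transaction dates instead of keeping only the last one
    dates = []
    for item in soup.find_all('transactionDate'):
        value = item.find('value')
        dates.append(value.text if value is not None else 'keine Angabe')
    return dates

# either join them into a single cell ...
_odata[1] = '; '.join(get_transaction_dates(soup))
# ... or append each date as its own column:
# _odata.extend(get_transaction_dates(soup))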
If you can show us what the data you're trying to parse actually looks like, perhaps we could give a more specific answer.
I have a CSV file with some links stored in one of the columns. I want to read only the links and print them out. I tried the following code, but nothing is printed.
import csv

filename = 'abc.csv'
with open(filename, 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        for item in row:
            if item.startswith('http'):
                print(item)
In Python 3 the csv module expects the file to be opened in text mode ('r'), not binary ('rb'):

import csv

with open('abc.csv', 'r') as csv_file:
    csv_reader = csv.reader(csv_file)
    for line in csv_reader:
        if line[0].startswith('http'):
            print(line)
If you want to make sure that the line starts with, for example, "http", you should write line[0].startswith("http"), because the first element of the line list will be a string.
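If the links can appear in any column (as in your original attempt), a sketch along the same lines:

import csv

with open('abc.csv', 'r', newline='') as csv_file:
    for row in csv.reader(csv_file):
        for item in row:
            # check every column, not just the first one
            if item.startswith('http'):
                print(item)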
I have written a script to scrape data from a website. It has 2 columns, but I want to add another column to it (an 'Abstract' column). How can I do this inside the same loop? I need to get the 'abstract' data into the third column. Image attached below.
The code is below:
import requests
import csv
from bs4 import BeautifulSoup

file = "Details181.csv"
Headers = ["Category", "Vulnerabilities", "Abstract"]
url = "https://vulncat.fortify.com/en/weakness?po={}"

with open(file, 'w', newline='') as f:
    csvriter = csv.writer(f, delimiter=',', quotechar='"')
    csvriter.writerow(Headers)
    for page in range(1, 131):
        r = requests.get(url.format(page))
        soup = BeautifulSoup(r.text, 'lxml')
        for title in soup.select('div.title > h1'):
            csvriter.writerow([t.strip() for t in title.text.split(':')])
According to your description, I guessed that the abstract, the category, and the vulnerability share a common parent div element.
I then tried to find that common div and extract the data from it in every loop iteration, and this confirmed my guess. I also added a default value for the vulnerability for the case where a title has no vulnerability content.
The following code runs successfully:
import requests
import csv
from bs4 import BeautifulSoup

file = "Details181.csv"
Headers = ["Category", "Vulnerabilities", "Abstract"]
url = "https://vulncat.fortify.com/en/weakness?po={}"

with open(file, 'w', newline='') as f:
    csvriter = csv.writer(f, delimiter=',', quotechar='"')
    csvriter.writerow(Headers)
    for page in range(1, 131):
        r = requests.get(url.format(page))
        soup = BeautifulSoup(r.text, 'lxml')
        # find the common father div info
        all_father_info = soup.find_all("div", class_="detailcell weaknessCell panel")
        for father in all_father_info:
            # find the son h1, then extract category and vulnerability
            son_info_12 = father.find('h1').text.split(":")
            if len(son_info_12) == 2:
                category, vulnerability = son_info_12[0].strip(), son_info_12[1].strip()
            elif len(son_info_12) == 1:
                category = son_info_12[0].strip()
                vulnerability = ""
            else:
                category, vulnerability = "", ""
            # find the son div, then extract the abstract
            abstract = father.find("div", class_="t").text.strip()
            # write the data to the csv file
            csvriter.writerow([category, vulnerability, abstract])
I'm trying to generate records from a csv retrieved from a given url in Python 3.
Given urlopen returns a bytes-mode file object but csv.DictReader expects text-mode file objects, I've had to wrap the urlopen file object in a TextIOWrapper (with a utf-8 decoding). Now, unfortunately I'm stuck between two undesirable options:
1) TextIOWrapper doesn't support seek, so I can't reset the csv_file generator after checking for a header with Sniffer.
2) If I don't seek back to 0, I truncate the first 18 records.
How do I modify the below code so that it can both check for headers and yield all records off of one urlopen call?
What have I tried?
Reading the URL twice: once to check for headers, a second time to generate the csv records. This seems suboptimal.
Code that skips the first 18 records is below. Uncomment the csv_file.seek(0) line to generate the seek error.
import csv
import io
import urllib.request

def read_url_csv(url, fieldnames=None, transform=None):
    if transform is None:
        transform = lambda x, filename, fieldnames: x
    with urllib.request.urlopen(url) as csv_binary:
        csv_file = io.TextIOWrapper(csv_binary, "utf-8")
        has_header = csv.Sniffer().has_header(csv_file.read(1024))
        #csv_file.seek(0)
        reader = csv.DictReader(csv_file, fieldnames=fieldnames)
        if has_header and fieldnames is not None:  # overwriting
            next(reader)
        for record in reader:
            yield transform(record, url, fieldnames)
StringIO supports seeking.
csv_file = io.StringIO(io.TextIOWrapper(csv_binary, "utf-8").read())
has_header = csv.Sniffer().has_header(csv_file.read(1024))
csv_file.seek(0)
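Applied to the function from the question, a minimal sketch of the whole generator (same signature, just with the seekable buffer swapped in) could look like this. Note the trade-off: the entire response is held in memory.

import csv
import io
import urllib.request

def read_url_csv(url, fieldnames=None, transform=None):
    if transform is None:
        transform = lambda x, filename, fieldnames: x
    with urllib.request.urlopen(url) as csv_binary:
        # read the whole response into a seekable in-memory buffer
        csv_file = io.StringIO(io.TextIOWrapper(csv_binary, "utf-8").read())
    has_header = csv.Sniffer().has_header(csv_file.read(1024))
    csv_file.seek(0)  # legal now: StringIO supports seek
    reader = csv.DictReader(csv_file, fieldnames=fieldnames)
    if has_header and fieldnames is not None:  # skip the header row
        next(reader)
    for record in reader:
        yield transform(record, url, fieldnames)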
I have used one of the methods described in "Python write to CSV line by line" to attempt to write all the lines of my output to a .CSV file. I've managed to get it to the stage of outputting and generating the CSV, but instead of showing all the lines of my data, I am seeing one line repeated 4 times and nothing else.
Can anyone see what the issue is here?
from bs4 import BeautifulSoup
import requests
import csv

headers = {'User-Agent': 'Mozilla/5.0'}

for i in range(1, 300):
    url = "xxx?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', class_='product-block__info')
    for item in items:
        product = item.find('span', class_='short_desc').text
        stock = item.find('span', class_='count_product_stock hidden').text
        brand = item.find('h4', class_='brand').text
        price = item.find('span', class_='selling_price').text
        # create a list of all the fields
        sheets = [brand, product, stock, price]
        print(sheets)

        with open('csvfile.csv', 'wt') as file:
            for l in sheets:
                file.writelines(sheets)
                file.write('\n')
You probably want something more like the following untested code. The example provided can't be run as is:
from bs4 import BeautifulSoup
import requests
import csv

headers = {'User-Agent': 'Mozilla/5.0'}

# Open the file once. See the csv documentation for the correct way to open
# a file for use with csv.writer. If you plan to open the .csv with
# Excel, the utf-8-sig encoding will allow non-ASCII to work correctly.
with open('csvfile.csv', 'w', encoding='utf-8-sig', newline='') as f:
    file = csv.writer(f)  # actually use the CSV module.
    for i in range(1, 300):
        url = "xxx?page=%s" % i
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        items = soup.find_all('div', class_='product-block__info')
        for item in items:
            product = item.find('span', class_='short_desc').text
            stock = item.find('span', class_='count_product_stock hidden').text
            brand = item.find('h4', class_='brand').text
            price = item.find('span', class_='selling_price').text
            # create a list of all the fields
            sheets = [brand, product, stock, price]
            # write a single line.
            file.writerow(sheets)
Here's a tested example that will open in Excel. I threw in a non-ASCII character and a comma in the data to demonstrate the csv module's ability to handle it:
#coding:utf8
import csv

with open('csvfile.csv', 'w', encoding='utf-8-sig', newline='') as f:
    file = csv.writer(f)
    file.writerow('BRAND PRODUCT STOCK PRICE'.split())
    for i in range(1, 11):
        sheets = ['brand{}'.format(i), 'pröduct{}'.format(i), 'st,ock{}'.format(i), 'price{}'.format(i)]
        file.writerow(sheets)
Output:
BRAND,PRODUCT,STOCK,PRICE
brand1,pröduct1,"st,ock1",price1
brand2,pröduct2,"st,ock2",price2
brand3,pröduct3,"st,ock3",price3
brand4,pröduct4,"st,ock4",price4
brand5,pröduct5,"st,ock5",price5
brand6,pröduct6,"st,ock6",price6
brand7,pröduct7,"st,ock7",price7
brand8,pröduct8,"st,ock8",price8
brand9,pröduct9,"st,ock9",price9
brand10,pröduct10,"st,ock10",price10
In Excel, the commas inside the quoted fields stay within a single cell.