I'm trying to get the Urls (lnk) and the paragraphs (txt) extract form python script below into a csv with pandas.
For some reason the generated csv returns the headers (lnk and txt) and the Urls only, but not the corresponding paragraphs.
The CSV file returns currently that
lnk | txt
url 1 |
url 2 |
What I need would be
lnk | txt
url 1 | text 1
url 2 | text 2
But both the Urls and the paragraphs do get printed in the cmd console.
Why doesn't the paragraphs get exported into the csv as well?
What would be a working fix to this problem? thanks.
(sorry for the long code, I'm new to Python)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
url_txt = []
#GET TEXT
def getPageText(url):
# given a url, get page content
data = urlopen(url).read()
# parse as html structured document
soup = BeautifulSoup(data, 'html.parser')
# kill javascript content
for s in soup(["script", "style"]):
s.replaceWith('')
#remove the text from this class
soup.find('p', {"class":"tiny-off"}).decompose()
#remove the text from this div id
soup.find('div', id = 'action-bar-top').decompose()
#remove the text from this div id
soup.find('div', id = 'main-content-sidebar').decompose()
#remove the text from this class
soup.find('div', {"class":"legal"}).decompose()
#get the 1st paragraph (which is a link)
for p in soup.find_all('p')[0]:
lnk = p.get_text()
print(lnk)
#remove the 1st paragraph (the link) from the following combined paragraphs
soup.find('p', id = 'current_url').decompose()
#extract all paragraphs save the 1st (the link)
for p in soup.find_all('p'):
txt = p.get_text().replace("\r", "").replace("\n", "")
print(txt)
# Compiling the info
lnktxt_data = [lnk, txt]
# Append the info to the complete dataset
url_txt.append(lnktxt_data)
#Get text from multiple urls
def main():
urls = [
'https://stackoverflow.com/questions/63400153/how-to-export-pandas-dataframe-into-csv-file', #dummy page
'https://stackoverflow.com/questions/52716762/how-to-join-newlines-into-a-paragraph-in-python' #dummy page
]
txt = [getPageText(url) for url in urls]
for t in txt:
print(t)
if __name__=="__main__":
main()
#FRAME DATA
# Making the dataframe
url_txt = pd.DataFrame(url_txt, columns = ['lnk', 'txt'])
url_txt.head()
#CREATE A FILE
# Save as CSV File
url_txt.to_csv('url_txt.csv',index=False)
I've found a simpler way that's working (with room for improvement),
with help from those two previous answers
How to join newlines into a paragraph in python
How to scrape web news and combine paragraphs into each article
Please let me know below how you would improve it if you find a better way.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
#GET TEXT
def getPageText(url):
# given a url, get page content
data = urlopen(url).read()
# parse as html structured document
soup = BeautifulSoup(data, 'html.parser')
# kill javascript content
for s in soup(["script", "style"]):
s.replaceWith('')
#
for p in soup.find_all('p')[1]:
lnk = p.get_text()
print(lnk)
#
# find body and extract text
p = soup.find("div", attrs={'class': 'article-content retro-folders'})
p.append(p.get_text())
x = p.text
y = x.replace("\r", "").replace("\n", "")
print(y)
# Compiling the info
lnktxt_data = [lnk, y]
# Append the info to the complete dataset
url_txt.append(lnktxt_data)
url_txt = []
#Get text from multiple urls
def main():
urls = [
'https://stackoverflow.com/questions/63400153/how-to-export-pandas-dataframe-into-csv-file', #dummy page
'https://stackoverflow.com/questions/52716762/how-to-join-newlines-into-a-paragraph-in-python' #dummy page
]
txt = [getPageText(url) for url in urls]
for t in txt:
print(t)
if __name__=="__main__":
main()
#FRAME DATA
# Making the dataframe
url_txt = pd.DataFrame(url_txt, columns = ['lnk', 'y'])
url_txt.head()
#CREATE A FILE
# Save as CSV File
url_txt.to_csv('url_txt.csv',index=False)
recently started getting into web scraping and i have managed ok but now im stuck and i cant find the answer or figure it out.
Here is my code for scraping and exporting info from a single page
import requests
page = requests.get("https://www.example.com/page.aspx?sign=1")
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
#finds the right heading to grab
box = soup.find('h1').text
heading = box.split()[0]
#finds the right paragraph to grab
reading = soup.find_all('p')[0].text
print (heading, reading)
import csv
from datetime import datetime
# open a csv file with append, so old data will not be erased
with open('index.csv', 'a') as csv_file:
writer = csv.writer(csv_file)
writer.writerow([heading, reading, datetime.now()])
Problem occurs when i try to scrape multiple pages at the same time.
They are all the same just pagination changes eg
https://www.example.com/page.aspx?sign=1
https://www.example.com/page.aspx?sign=2
https://www.example.com/page.aspx?sign=3
https://www.example.com/page.aspx?sign=4 etc
Instead of writing the same code 20 times how do i stick all the data in a tuple or an array and export to csv.
Many thanks in advance.
Just try it out with a loop, until you got no page available (request is not OK). Should be easy to get.
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
results = []
page_number = 1
while True:
response = requests.get(f"https://www.example.com/page.aspx?sign={page_number}")
if response.status_code != 200:
break
soup = BeautifulSoup(page.content, 'html.parser')
#finds the right heading to grab
box = soup.find('h1').text
heading = box.split()[0]
#finds the right paragraph to grab
reading = soup.find_all('p')[0].text
# write a list
# results.append([heading, reading, datetime.now()])
# or tuple.. your call
results.append((heading, reading, datetime.now()))
page_number = page_number + 1
with open('index.csv', 'a') as csv_file:
writer = csv.writer(csv_file)
for result in results:
writer.writerow(result)
I have tried reading a table from a website. It can be seen from my code that I have gone too far to get the table, I would appreciate if someone give me an opportunity to learn a quick method to do the same.
Here's my code:
import urllib.request
from bs4 import BeautifulSoup
url = "http://www.kazusa.or.jp/codon/cgi-bin/showcodon.cgi?species=9606&aa=1&style=N"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html)
text = soup.get_text()
with open('myfile.txt', 'w') as file:
file.writelines(text)
with open('myfile.txt','r') as g:
f = g.readlines()
tab = f[12:31]
table = [x.strip() for x in tab]
Every time running the code messes up with writing and reading the file.
You shouldn't need files. Filter for the pre tag instead, to target the table alone.
soup = BeautifulSoup(html)
text=soup.find('pre')
table = [x.strip() for x in text]
I have used one of the methods described here Python write to CSV line by line to attempt to write all the lines of my output to a .CSV. I've managed to get it to the stage of outputting and generating the CSV, but instead of showing all the lines of my data I am seeing one line, repeated 4 times and nothing else.
Can anyone see what the issue is here?
from bs4 import BeautifulSoup
import requests
import csv
headers = {'User-Agent': 'Mozilla/5.0'}
for i in range(1, 300):
url = "xxx?page=%s" % i
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
items = soup.find_all('div', class_='product-block__info')
for item in items:
product = item.find('span', class_='short_desc').text
stock = item.find('span', class_='count_product_stock hidden').text
brand = item.find('h4', class_='brand').text
price = item.find('span', class_='selling_price').text
# create a list of all the fields
sheets = [brand, product, stock, price]
print(sheets)
with open('csvfile.csv','wt') as file:
for l in sheets:
file.writelines(sheets)
file.write('\n')
You probably want something more like the following untested code. The example provided can't be run as is:
from bs4 import BeautifulSoup
import requests
import csv
headers = {'User-Agent': 'Mozilla/5.0'}
# Open the file once. See the csv documentation for the correct way to open
# a file for use with csv.writer. If you plan to open the .csv with
# Excel, the utf-8-sig encoding will allow non-ASCII to work correctly.
with open('csvfile.csv','w', encoding='utf-8-sig', newline='') as f:
file = csv.writer(f) # actually use the CSV module.
for i in range(1, 300):
url = "xxx?page=%s" % i
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
items = soup.find_all('div', class_='product-block__info')
for item in items:
product = item.find('span', class_='short_desc').text
stock = item.find('span', class_='count_product_stock hidden').text
brand = item.find('h4', class_='brand').text
price = item.find('span', class_='selling_price').text
# create a list of all the fields
sheets = [brand, product, stock, price]
# write a single line.
file.writerow(sheets)
Here's a tested example that will open in Excel. I threw in a non-ASCII character and a comma in the data to demonstrate the csv module's ability to handle it:
#coding:utf8
import csv
with open('csvfile.csv','w', encoding='utf-8-sig', newline='') as f:
file = csv.writer(f)
file.writerow('BRAND PRODUCT STOCK PRICE'.split())
for i in range(1,11):
sheets = ['brand{}'.format(i),'pröduct{}'.format(i),'st,ock{}'.format(i),'price{}'.format(i)]
file.writerow(sheets)
Output:
BRAND,PRODUCT,STOCK,PRICE
brand1,pröduct1,"st,ock1",price1
brand2,pröduct2,"st,ock2",price2
brand3,pröduct3,"st,ock3",price3
brand4,pröduct4,"st,ock4",price4
brand5,pröduct5,"st,ock5",price5
brand6,pröduct6,"st,ock6",price6
brand7,pröduct7,"st,ock7",price7
brand8,pröduct8,"st,ock8",price8
brand9,pröduct9,"st,ock9",price9
brand10,pröduct10,"st,ock10",price10
In Excel:
I get a list of links in the output file but need all of the links to show as absolute links. Some are absolute and others are relative. How do I append the base url to the relatives to ensure that I get only absolute links in the csv output?
I get back all the links but not all are absolute links e.g /subpage instead of http://page.com/subpage
from bs4 import BeautifulSoup
import requests
import csv
j = requests.get("http://cnn.com").content
soup = BeautifulSoup(j, "lxml")
#only return links to subpages e.g. a tag that contains href
data = []
for url in soup.find_all('a', href=True):
print(url['href'])
data.append(url['href'])
print(data)
with open("file.csv",'w') as csvfile:
write = csv.writer(csvfile, delimiter = ' ')
write.writerows(data)
content = open('file.csv', 'r').readlines()
content_set = set(content)
cleandata = open('file.csv', 'w')
for line in content_set:
cleandata.write(line)
with urljoin:
from urlparse import urljoin
...
base_url = "http://cnn.com"
absolute_url = urljoin(base_url, relative_url)