How can I get BeautifulSoup info in the same row? - python-3.x

I am currently web scraping and would like to get the specifications on the same row. When I currently print it column 2 looks like this:
text
text
text
text
text
I would like to get it all on the same row like this
text text text text text
so i can later chop it up into different columns in Excel later.
Is there maybe a transposing command I could use or something else?
Code:
import requests
from bs4 import BeautifulSoup
import csv
with open('Oslo.csv', 'w', newline='') as f:
fieldnames = ['column1', 'column2']
skriver = csv.DictWriter(f, fieldnames=fieldnames)
skriver.writeheader()
def data(page_number):
URL = 'https://www.url.com/' + str(
page_number) + '&sort=PUBLISHED_DESC'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
for data in ads:
id = data.find('a')
link = (id['id'])
url = 'https://www.url.com/'+str(link)
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
ads = soup.findAll('div', class_="u-word-break")
for stats in ads:
address = stats.find('p', class_="u-caption")
specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
skriver.writerow({'column1': address.text.strip(), 'column2': specs.text})
for x in range(1, 2):
data(x)
print('Ferdig, du kan åpne oslo.csv')
EDIT: Scraping from the website is illegal, so I removed the URL.

your specs.text is a string that contains \n new lines. You can split it, then join it back with just a space. Ie ' '.join(specs.text.split())
import requests
from bs4 import BeautifulSoup
import csv
with open('Oslo.csv', 'w', newline='') as f:
fieldnames = ['column1', 'column2']
skriver = csv.DictWriter(f, fieldnames=fieldnames)
skriver.writeheader()
def data(page_number):
URL = 'https://www.url.com/' + str(page_number) + '&sort=PUBLISHED_DESC'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
for data in ads:
id = data.find('a')
link = (id['id'])
url = 'https://www.url.com/'+str(link)
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
ads = soup.findAll('div', class_="u-word-break")
for stats in ads:
address = stats.find('p', class_="u-caption")
specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
address = ' '.join(address.text.split())
specs = ' '.joins(specs.text.split()) #<-- changed here
skriver.writerow({'column1': address, 'column2': specs})
for x in range(1, 2):
data(x)
print('Ferdig, du kan åpne oslo.csv')

Related

Python - Problem with saving as '.csv' extension

Functioning code:
import requests
from bs4 import BeautifulSoup
import pyautogui
import csv
url = 'https://url~~'
res = requests.get(url)
html = res.text
soup = BeautifulSoup(html, 'html.parser')
total = soup.select('.gall_title')
searchList = []
contentList = []
for i in total:
searchList.append("https://url~~" + i.attrs['href'])
for i in searchList:
res2 = requests.get(i)
html2 = res2.text
soup2 = BeautifulSoup(html2, 'html.parser')
content_h = soup2.select('h3 > span.title_subject')
contentList.append(content_h)
print(contentList)
#save csv
f = open(1.csv', 'w', encoding='utf-8', newline='')
csvWriter = csv.writer(f)
for i in contentList:
csvWriter.writerow(i)
f.close()
★Result★
print(contentList):
# [[<span class="title_subject">tomato</span>], [<span class="title_subject">apple</span>]]
Image:
enter image description here
Non-functioning code:
import requests
from bs4 import BeautifulSoup
import pyautogui
import csv
url = 'https://url~~'
res = requests.get(url)
html = res.text
soup = BeautifulSoup(html, 'html.parser')
total = soup.select('.gall_title')
searchList = []
contentList = []
for i in total:
searchList.append("https://url~~" + i.attrs['href'])
for i in searchList:
res2 = requests.get(i)
html2 = res2.text
soup2 = BeautifulSoup(html2, 'html.parser')
content_h = str(soup2.select('h3 > span.title_subject')) // only changed
contentList.append(content_h)
print(contentList)
#save csv
f = open(1.csv', 'w', encoding='utf-8', newline='')
csvWriter = csv.writer(f)
for i in contentList:
csvWriter.writerow(i)
f.close()
★Result★
print(contentList):
['[tomato]', '[apple]']
Image:
enter image description here
How can I remove the issue where strings are being saved one character at a time in a '.csv' file?
soup2.select('h3 > span.title_subject') gives you the whole HTML tag. Extract its string value with .string:
strings = [x.string for x in soup2.select('h3 > span.title_subject')]

Bs4 scrape table specifik

I hope you guys are always healthy
I want to scrape a more specific table using BS4. this is my code:
from bs4 import BeautifulSoup
import requests
url = 'test.com'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
for row in soup.select('tbody tr'):
row_text = [x.text for x in row.find_all('td')]
print (row_text)
how do you get results like this:
Number, Name, address, telp, komoditi
1, "ABON JUARA" JUARA FOOD INDUSTRY, Jl. Jend Sudirman 339, Salatiga, Jawa Tengah, 0298-324060, Abon Sapi Dan Ayam
and saved in CSV
import requests
from bs4 import BeautifulSoup
import csv
def main(url):
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
target = soup.select_one("table#newspaper-a").select("tr[valign=top]")
with open("data.csv", 'w', newline="") as f:
writer = csv.writer(f)
writer.writerow(["No", "Name", "Address", "Tel", "Komoditi"])
for item in target:
item = list(item.stripped_strings)
item[3] = item[3][6:]
writer.writerow(item)
main("https://kemenperin.go.id/direktori-perusahaan?what=&prov=&hal=1")
Output: view-online

Scraping all href links using Pagination

I've to Select each state from https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall and then click on team rankings and after that I've to grab href links of each ranked team.
I've completed till team rankings part now I want get links of each ranked team from all the pages in the pagination bar right now I'm getting links of all teams available on the first page only, I don't how to navigate to the next page.(below is the code)
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
site = "https://www.maxpreps.com"
url = requests.get("https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall")
soup = BeautifulSoup(url.content, "html.parser")
states = soup.findAll('div', {'class': 'states'})
for each_state in states:
all_states = each_state.find_all('a', href=True)
for a in all_states:
domain = site + a['href'] #domain consist oflinks of states
for r in domain:
page_link = domain
page_response = requests.get(page_link)
soup = BeautifulSoup(page_response.content, "html.parser")
for link in soup.findAll('a', attrs={'href': re.compile("rankings")}):
rankings_link = site + link.get('href')
#print(rankings_link)
for ert in rankings_link:
team_link = rankings_link
page_response1 = requests.get(team_link)
soup = BeautifulSoup(page_response1.content, "html.parser")
My_table = soup.find('table',{'class':'mx-grid sortable rankings-grid'})
links = My_table.findAll('a')
print(links)
output:
Everett, Methuen,
You could just iterate through pages within the query parameters.
import requests
from bs4 import BeautifulSoup
site = "https://www.maxpreps.com"
session = requests.Session()
response = session.get("https://www.maxpreps.com/search/states_by_sport.aspx?gendersport=boys,football&season=fall")
soup = BeautifulSoup(response.content, "html.parser")
all_states = soup.find('div', {'class': 'states'})
states_list = []
for each in all_states.find_all('a'):
states_list.append(each['href'].split('=')[-1])
states_list = states_list[:-1]
team_links = []
url = 'https://www.maxpreps.com/m/rankings/list.aspx'
for state in states_list:
break_loop = False
page=1
while break_loop == False:
print ('%s: Page %s' %(state, page))
payload = {
'page': str(page),
'ssid': '8d610ab9-220b-465b-9cf0-9f417bce6c65',
'state': state
}
response = requests.get(url, params=payload)
soup = BeautifulSoup(response.text, "html.parser")
table = soup.find('table')
if table == None:
break_loop = True
else:
page+=1
links = table.find_all('a')
for link in links:
team_links.append('https://www.maxpreps.com' + link['href'])
Output:
print (team_links[:10])
['https://www.maxpreps.com/m/high-schools/central-red-devils-(phenix-city,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/thompson-warriors-(alabaster,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/hoover-buccaneers-(hoover,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/oxford-yellow-jackets-(oxford,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/mountain-brook-spartans-(birmingham,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/hewitt-trussville-huskies-(trussville,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/mcgill-toolen-yellowjackets-(mobile,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/lee-generals-(montgomery,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/pinson-valley-indians-(pinson,al)/football/default.htm', 'https://www.maxpreps.com/m/high-schools/vestavia-hills-rebels-(vestavia-hills,al)/football/default.htm']

Extracting Data from HTML Span using Beautiful Soup

I want to extract"1.02 Crores" and "7864" from html code and save them in different column in csv file.
Code:
<div class="featuresvap _graybox clearfix"><h3><span><i class="icon-inr"></i>1.02 Crores</span><small> # <i class="icon-inr"></i><b>7864/sq.ft</b> as per carpet area</small></h3>
Not sure about the actual data but this is just something that I threw together really quick. If you need it to navigate to a website then use import requests. you'' need to add url = 'yourwebpagehere' page = requests.get(url) and change soup to soup = BeautifulSoup(page.text, 'lxml') then remove the html variable since it would be unneeded.
from bs4 import BeautifulSoup
import csv
html = '<div class="featuresvap _graybox clearfix"><h3><span><i class="icon-inr"></i>1.02 Crores</span><small> # <i class="icon-inr"></i><b>7864/sq.ft</b> as per carpet area</small></h3>'
soup = BeautifulSoup(html, 'lxml')
findSpan = soup.find('span')
findB = soup.find('b')
print([findSpan.text, findB.text.replace('/sq.ft', '')])
with open('NAMEYOURFILE.csv', 'w+') as writer:
csv_writer = csv.writer(writer)
csv_writer.writerow(["First Column Name", "Second Column Name"])
csv_writer.writerow([findSpan, findB])
self explained in code
from bs4 import BeautifulSoup
# data for first column
firstCol = []
# data for second column
secondCol = []
for url in listURL:
html = '.....' # downloaded html
soup = BeautifulSoup(html, 'html.parser')
# 'select_one' select using CSS selectors, return only first element
fCol = soup.select_one('.featuresvap h3 span')
# remove: <i class="icon-inr"></i>
span.find("i").extract()
sCol = soup.select_one('.featuresvap h3 b')
firstCol.append(fCol.text)
secondCol.append(sCol.text.replace('/sq.ft', ''))
with open('results.csv', 'w') as fl:
csvContent = ','.join(firstCol) + '\n' + ','.join(secondCol)
fl.write(csvContent)
''' sample results
1.02 Crores | 2.34 Crores
7864 | 2475
'''
print('finish')

How to return href link which are not starting "\listing" using python

I am trying to scrape the links in https://www.panpages.my/search_results?q=
I have written Python script to get all links in each pages
I want to filter the links which is not starting as "\Listings"
Please find my script below and help me:
import requests
from bs4 import BeautifulSoup
import re
from io import StringIO
import csv
data = open("D:/Mine/Python/Projects/Freelancer/seekProgramming/rootpages.csv").read()
dataFile = StringIO(data)
csvReader = csv.reader(dataFile)
f = open('paylinks.csv', 'w', newline = '')
writer = csv.writer(f)
for row in csvReader:
myurl = row[0]
def simple_web_scrapper(url):
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for root in soup.findAll('div', {"class": "mid_section col-xs-10 col-sm-7 tmargin xs-nomargin"}):
for link in root.findAll('a'):
href = link.get('href')
print(href)
simple_web_scrapper(myurl)
for row in csvReader:
myurl = row[0]
def simple_web_scrapper(url):
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for root in soup.findAll('div', {"class": "mid_section col-xs-10 col-sm-7 tmargin xs-nomargin"}):
for link in root.findAll('a'):
href = link.get('href')
if href.startswith('\listings'): #that's the row you need
print(href)
simple_web_scrapper(myurl)

Resources