data scraping from multiple pages - python-3.x

Below code works for single page scraping from olx, need help to extract from multiple pages.
import requests
import re
from django.conf.urls import url, include
url_base = "https://www.olx.in"
url = url_base + "/hyderabad_g4058526/cars_c5"
info_labels = ("itemDetails","itemPrice", "itemTitle", "item-location")
info_pattern = r'(?s)(.?)'
link_pattern = r'(?s)?data-aut-id="itemBox".*?href="([^"]+?)"'
response = requests.get(url)
cars = list(zip( *(re.findall(info_pattern.format(label), response.text) for label in info_labels),
(url_base + path for path in re.findall(link_pattern, response.text)) ))
print(cars)
with open('cars_olx.txt', 'w', encoding='utf-8') as f:
for item in cars:
f.write(u"%s\n" % str(item))

Related

How to download image from URL using beautiful soup in high quality?

I am trying to download images using beautiful soup While Importing a list of URLs from .CSV file. Now I am getting results like below,
<img class="pick" src="backup/remote_2109image/008f3ef7-1da9-11ec-abad-88ae1db4aa6901.jpg" width="350height=616\"/>
In the below code, I am trying to get an image from URL that has the class 'pick'
Now, How Will I download this in a folder?
import csv
import requests
import os
import urllib
from bs4 import BeautifulSoup as bs
with open('cat.csv', 'r') as file:
reader = csv.reader(file)
for row in reader:
imagesname = ' '.join(row)
r = requests.get(imagesname)
soup = bs(r.content, 'html.parser')
tables = soup.find_all('img', class_='pick')
for image in tables:
print(image)
You might try this:
with open('cat.csv', 'r') as file:
reader = csv.reader(file)
for row in reader:
imagesname = ' '.join(row)
r = requests.get(imagesname)
soup = bs(r.content, 'html.parser')
tables = soup.find_all('img', class_='pick')
inParsed = urllib.parse.urlparse(imagesname) # break down url
rootUrl = f'{inParsed.scheme}://{inParsed.netloc}' # to get root
for image in tables:
imageUrl = urllib.parse.urljoin(rootUrl, imageUrl.get('src')) # add root to src
saveImgAs = [u for u in imageUrl.split('/') if u][-1] # get name from link
with open(saveImgAs, "wb") as f:
f.write(requests.get(imageUrl).content) # download
f.close()
print(saveImgAs, image)
I'm not entirely sure about the formation of imageUrl nor of how consistent your image src values might be - if I had a few of your row values, I would have been able to run a few tests first, but hopefully this works
I made some changes to download image from URL which is in CSV file
import csv
import requests
import os
import urllib
from bs4 import BeautifulSoup as bs
with open('cat.csv', 'r') as file:
reader = csv.reader(file)
for row in reader:
imagesname = ' '.join(row)
r = requests.get(imagesname)
soup = bs(r.content, 'html.parser')
tables = soup.find_all('img', class_='pick')
for image in tables:
img_url = image.get('src').replace('\\', '/')
real_url = "domain-name" + img_url
img_name = str(img_url.split('/')[-1])
urllib.request.urlretrieve(real_url, os.path.join(
path, img_name))

How can I get BeautifulSoup info in the same row?

I am currently web scraping and would like to get the specifications on the same row. When I currently print it column 2 looks like this:
text
text
text
text
text
I would like to get it all on the same row like this
text text text text text
so i can later chop it up into different columns in Excel later.
Is there maybe a transposing command I could use or something else?
Code:
import requests
from bs4 import BeautifulSoup
import csv
with open('Oslo.csv', 'w', newline='') as f:
fieldnames = ['column1', 'column2']
skriver = csv.DictWriter(f, fieldnames=fieldnames)
skriver.writeheader()
def data(page_number):
URL = 'https://www.url.com/' + str(
page_number) + '&sort=PUBLISHED_DESC'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
for data in ads:
id = data.find('a')
link = (id['id'])
url = 'https://www.url.com/'+str(link)
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
ads = soup.findAll('div', class_="u-word-break")
for stats in ads:
address = stats.find('p', class_="u-caption")
specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
skriver.writerow({'column1': address.text.strip(), 'column2': specs.text})
for x in range(1, 2):
data(x)
print('Ferdig, du kan åpne oslo.csv')
EDIT: Scraping from the website is illegal, so I removed the URL.
your specs.text is a string that contains \n new lines. You can split it, then join it back with just a space. Ie ' '.join(specs.text.split())
import requests
from bs4 import BeautifulSoup
import csv
with open('Oslo.csv', 'w', newline='') as f:
fieldnames = ['column1', 'column2']
skriver = csv.DictWriter(f, fieldnames=fieldnames)
skriver.writeheader()
def data(page_number):
URL = 'https://www.url.com/' + str(page_number) + '&sort=PUBLISHED_DESC'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
for data in ads:
id = data.find('a')
link = (id['id'])
url = 'https://www.url.com/'+str(link)
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
ads = soup.findAll('div', class_="u-word-break")
for stats in ads:
address = stats.find('p', class_="u-caption")
specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
address = ' '.join(address.text.split())
specs = ' '.joins(specs.text.split()) #<-- changed here
skriver.writerow({'column1': address, 'column2': specs})
for x in range(1, 2):
data(x)
print('Ferdig, du kan åpne oslo.csv')

CSV writer writes set to a single row rather than multiple rows

I am working on a web scraper for class. I basically have to compile all of the http links from a website and write them to a csv. They also need to be de-duplicated which is why I'm using a set. I have all the parts complete expect when it writes to the csv, the entire set of links writes to a single row rather than one link per row. Can someone review my code and tell me what i'm missing? I cannot find a solution anywhere.
My code is below:
from bs4 import BeautifulSoup
import requests
import csv
import urllib.parse
base_url = 'https://www.census.gov'
l = set()
r = requests.get("https://www.census.gov/programs-surveys/popest.html")
c = r.content
soup = BeautifulSoup(c, 'html.parser')
file = open('c996webscraper_writer.csv', 'w', newline="")
for link in soup.findAll('a'):
output = link.get('href')
abs_url = urllib.parse.urljoin(base_url, output)
l.add(abs_url)
with file:
write = csv.writer(file, delimiter = ',', lineterminator = '\r')
write.writerow(['List of Links'])
write.writerows([l])
file.close()
This is a printout of what's happening:
CSV Image
from bs4 import BeautifulSoup
import requests
import csv
import urllib.parse
base_url = 'https://www.census.gov'
l = list()
r = requests.get("https://www.census.gov/programs-surveys/popest.html")
c = r.content
soup = BeautifulSoup(c, 'html.parser')
file = open('c996webscraper_writer.csv', 'w', newline="")
for link in soup.findAll('a'):
output = link.get('href')
abs_url = urllib.parse.urljoin(base_url, output)
l.append(abs_url)
with file:
write = csv.writer(file)
write.writerow(['List of Links'])
for x in l:
write.writerow([x])
file.close()

By Beautiful Soup i scrape twitter data. I am able to get data but can't save in csv file

I scraped Twitter for user name, Tweets, replies, retweets but can't save in a CSV file.
Here is the code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
file = "5_twitterBBC.csv"
f = open(file, "w")
Headers = "tweet_user, tweet_text, replies, retweets\n"
f.write(Headers)
for page in range(0,5):
url = "https://twitter.com/BBCWorld".format(page)
html = urlopen(url)
soup = BeautifulSoup(html,"html.parser")
tweets = soup.find_all("div", {"class":"js-stream-item"})
for tweet in tweets:
try:
if tweet.find('p',{"class":'tweet-text'}):
tweet_user = tweet.find('span',{"class":'username'}).text.strip()
tweet_text = tweet.find('p',{"class":'tweet-text'}).text.encode('utf8').strip()
replies = tweet.find('span',{"class":"ProfileTweet-actionCount"}).text.strip()
retweets = tweet.find('span', {"class" : "ProfileTweet-action--retweet"}).text.strip()
print(tweet_user, tweet_text, replies, retweets)
f.write("{}".format(tweet_user).replace(",","|")+ ",{}".format(tweet_text)+ ",{}".format( replies).replace(",", " ")+ ",{}".format(retweets) + "\n")
except: AttributeError
f.close()
I get data but can't save in CSV file. Someone explain me how to save data in CSV file.
As you can see, you've only made a small error in finding the tweets here tweets = soup.find_all("div", {"class":"js-stream-item"}), you forgot to pass on the argument key name which should be like this tweets = soup.find_all("div", attrs={"class":"js-stream-item"})
This is a working solution but it only fetches the first 20 tweets
from urllib.request import urlopen
from bs4 import BeautifulSoup
file = "5_twitterBBC.csv"
f = open(file, "w")
Headers = "tweet_user, tweet_text, replies, retweets\n"
f.write(Headers)
url = "https://twitter.com/BBCWorld"
html = urlopen(url)
soup = BeautifulSoup(html, "html.parser")
# Gets the tweet
tweets = soup.find_all("li", attrs={"class":"js-stream-item"})
# Writes tweet fetched in file
for tweet in tweets:
try:
if tweet.find('p',{"class":'tweet-text'}):
tweet_user = tweet.find('span',{"class":'username'}).text.strip()
tweet_text = tweet.find('p',{"class":'tweet-text'}).text.encode('utf8').strip()
replies = tweet.find('span',{"class":"ProfileTweet-actionCount"}).text.strip()
retweets = tweet.find('span', {"class" : "ProfileTweet-action--retweet"}).text.strip()
# String interpolation technique
f.write(f'{tweet_user},/^{tweet_text}$/,{replies},{retweets}\n')
except: AttributeError
f.close()
filename = "output.csv"
f = open(filename, "w",encoding="utf-8")
headers = " tweet_user, tweet_text, replies, retweets \n"
f.write(headers)
***your code***
***loop****
f.write(''.join(tweet_user + [","] + tweet_text + [","] + replies + [","] + retweets + [","] + ["\n"]) )
f.close()

How Can I Loop Through URLs and import TD elements from several Links

I am trying to import data from the following URLs, and write each data set to a CSV file.
Here are a few sample URls that I want to grab fundamental data from:
https://finviz.com/quote.ashx?t=sbuc
https://finviz.com/quote.ashx?t=msft
https://finviz.com/quote.ashx?t=aapl
How can I import the data from 'Index' to 'Change'?
I think the script should, basically, look like this.
import csv
import urllib.request
from bs4 import BeautifulSoup
soup = BeautifulSoup("html.parser")
url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']
for stocks in tckr:
url_list = [url_base + tckr]
with open('C:/Users/Excel/Desktop/today.csv', 'a', newline='') as file:
writer = csv.writer(file)
for url in url_list:
try:
fpage = urllib.request.urlopen(url)
fsoup = BeautifulSoup(fpage, 'html.parser')
# write header row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2-cp'})))
# write body row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2'})))
except urllib.error.HTTPError:
print("{} - not found".format(url))
Except, when I run it, I get this error message: SyntaxError: EOL while scanning string literal
import csv
import requests
from bs4 import BeautifulSoup
url_base = "https://finviz.com/quote.ashx?t="
tckr = ['SBUX','MSFT','AAPL']
url_list = [url_base + s for s in tckr]
with open('../Python/SOtest.csv', 'a', newline='') as f:
writer = csv.writer(f)
for url in url_list:
try:
fpage = requests.get(url)
fsoup = BeautifulSoup(fpage.content, 'html.parser')
# write header row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2-cp'})))
# write body row
writer.writerow(map(lambda e : e.text, fsoup.find_all('td', {'class':'snapshot-td2'})))
except HTTPError:
print("{} - not found".format(url))
I use requests so there is that difference. But it works so you can pull code from there if need be.

Resources