I am trying to load video IDs from a pickle file and then print the description of each YouTube video, but I am getting this error: 'bytes' object has no attribute 'findAll'. I want to extract the description, video ID, and title into a CSV.
cwd = os.getcwd()
parent_folder = os.path.join(cwd, 'Data')
pickle_out = open("C:/Users/india/Desktop/PARUL/Data/Pickle/vid_ids_dict.pickle", "rb")
vid_id_dict = pickle.load(pickle_out)
dataset_folder = os.path.join(parent_folder, "Dataset")
if not os.path.exists(dataset_folder):
    os.makedirs(dataset_folder)
csv_file_path = os.path.join(parent_folder, 'main.csv')
base = "https://www.youtube.com/watch?v="

for keys, values in vid_id_dict.items():
    for key in keys:
        query_dataset_folder = os.path.join(dataset_folder, key)
        if not os.path.exists(query_dataset_folder):
            os.makedirs(query_dataset_folder)
        for VidID in values:
            r = requests.get(base + VidID)
            soup = bs(r.text, 'html.parser').encode("utf-8")
            name = VidID + ".txt"
            save_description_link = os.path.join(query_dataset_folder, name)
            f = open(save_description_link, "a+")
            for title in soup.findAll('p', attrs={'id': 'eow-description'}):
                description = title.text.strip()
                f.write(description)
                print(description)
            f.close()
            for title in soup.findAll('span', attrs={'class': 'watch-title'}):
                vid_title = title.text.strip()
                print(vid_title)
            with open(csv_file_path, 'a+') as csvfile:
                fieldnames = ['Video id', 'Title', 'Description', 'Category']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writerow({'Video id': VidID, 'Title': vid_title, 'Description': description, 'Category': key})
Error:
AttributeError                            Traceback (most recent call last)
     35 f = open(save_description_link, "a+")
     36
---> 37 for title in soup.findAll('p', attrs={'id': 'eow-description'}):
     38     description = title.text.strip()
     39     f.write(description)

AttributeError: 'bytes' object has no attribute 'findAll'
I assume you're using the requests module to get the HTTP responses. The error occurs because calling .encode("utf-8") on the BeautifulSoup object turns it into a bytes object, and a bytes object has no findAll method.
Just replace that part of your code with:
r.encoding = 'utf-8'
soup = bs(r.content, 'html.parser')
Final running code:
for keys, values in vid_id_dict.items():
    for key in keys:
        query_dataset_folder = os.path.join(dataset_folder, key)
        if not os.path.exists(query_dataset_folder):
            os.makedirs(query_dataset_folder)
        for VidID in values:
            r = requests.get(base + VidID)
            r.encoding = 'utf-8'
            soup = bs(r.content, 'html.parser')
            name = VidID + ".txt"
            save_description_link = os.path.join(query_dataset_folder, name)
            f = open(save_description_link, "a+")
            for title in soup.findAll('p', attrs={'id': 'eow-description'}):
                description = title.text.strip()
                f.write(description)
                print(description)
            f.close()
            for title in soup.findAll('span', attrs={'class': 'watch-title'}):
                vid_title = title.text.strip()
                print(vid_title)
            with open(csv_file_path, 'a+') as csvfile:
                fieldnames = ['Video id', 'Title', 'Description', 'Category']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writerow({'Video id': VidID, 'Title': vid_title, 'Description': description, 'Category': key})
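One caveat with the snippet above (my own note, not part of the original answer): if a page has no matching description or title element, the variables description and vid_title are never assigned and the CSV write raises a NameError; YouTube's current pages are also rendered largely with JavaScript, so the eow-description / watch-title selectors may return nothing on newer layouts. A defensive sketch for the per-video step, with a hypothetical helper name save_video_row, could look like this:

import csv
import os
import requests
from bs4 import BeautifulSoup as bs

def save_video_row(VidID, key, query_dataset_folder, csv_file_path,
                   base="https://www.youtube.com/watch?v="):
    # Hypothetical helper, not from the original answer: fetch one video page
    # and write description/title with safe defaults.
    r = requests.get(base + VidID)
    r.encoding = 'utf-8'
    soup = bs(r.content, 'html.parser')

    description = ""   # defaults so the CSV write never raises NameError
    vid_title = ""

    tag = soup.find('p', attrs={'id': 'eow-description'})
    if tag:
        description = tag.text.strip()
    tag = soup.find('span', attrs={'class': 'watch-title'})
    if tag:
        vid_title = tag.text.strip()

    with open(os.path.join(query_dataset_folder, VidID + ".txt"), "a+") as f:
        f.write(description)

    fieldnames = ['Video id', 'Title', 'Description', 'Category']
    # Write the header row only once, when the CSV is new or empty.
    write_header = not os.path.exists(csv_file_path) or os.path.getsize(csv_file_path) == 0
    with open(csv_file_path, 'a+', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()
        writer.writerow({'Video id': VidID, 'Title': vid_title,
                         'Description': description, 'Category': key})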
Related
The code below gives me the following error:
ValueError: Length mismatch: Expected axis has 0 elements, new values have 1 elements
on the df.columns = ["GP Practice Name"] line.
I tried
import pandas as pd
import requests
from bs4 import BeautifulSoup

postal_codes = ["2000", "2010", "2020", "2030", "2040"]
places_by_postal_code = {}

def get_places(postal_code):
    url = f"https://www.yellowpages.com.au/search/listings?clue={postal_code}&locationClue=&latitude=&longitude=&selectedViewMode=list&refinements=category:General%20Practitioner&selectedSortType=distance"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    places = soup.find_all("div", {"class": "listing-content"})
    return [place.find("h2").text for place in places]

for postal_code in postal_codes:
    places = get_places(postal_code)
    places_by_postal_code[postal_code] = places

df = pd.DataFrame.from_dict(places_by_postal_code, orient='index')
df.columns = ["GP Practice Name"]
df = pd.DataFrame(places_by_postal_code.values(), index=places_by_postal_code.keys(), columns=["GP Practice Name"])
print(df)
and was expecting a list of GPs for the postcodes specified in the postal_codes variable.
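This question has no answer in the thread, so the following is only a hedged note: the "Expected axis has 0 elements" message means the DataFrame built by from_dict(..., orient='index') ended up with zero columns, which happens when every scraped list is empty (the site likely returned no listing-content divs to a plain requests call). Even with data, orient='index' creates one column per list element, so a single column name only fits if each postcode returns exactly one practice. A long-format sketch, one row per (postcode, practice), sidesteps both problems; the sample data below is hypothetical and stands in for the scraped dict:

import pandas as pd

# Hypothetical sample standing in for places_by_postal_code built above.
places_by_postal_code = {"2000": ["Practice A", "Practice B"], "2010": []}

# One row per (postcode, practice); an empty scrape just yields an empty frame
# with the expected columns instead of a ValueError.
rows = [
    {"Postal Code": code, "GP Practice Name": name}
    for code, names in places_by_postal_code.items()
    for name in names
]
df = pd.DataFrame(rows, columns=["Postal Code", "GP Practice Name"])
print(df)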
The example below doesn't seem to work.
import csv

values = [{'emp_name': 'John Wick',
           'dept': 'Accounting', 'val': '10'},
          {'emp_name': 'Neo Anderson',
           'dept': 'IT', 'val': '20'}]

with open('file1.csv', mode='w') as csv_file:
    fieldnames = ['name', 'department']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    for value in values:
        writer.writerow({"'name': '{emp_name}', 'department': '{dept}'".format(
            **value)
        })
Error:
Traceback (most recent call last):
  File "csv_test.py", line 13, in <module>
    **value)
  File "python-3.7.5/lib/python3.7/csv.py", line 155, in writerow
    return self.writer.writerow(self._dict_to_list(rowdict))
  File "python-3.7.5/lib/python3.7/csv.py", line 148, in _dict_to_list
    wrong_fields = rowdict.keys() - self.fieldnames
AttributeError: 'set' object has no attribute 'keys'
Expected Output:
name, department
John Wick, Accounting
Neo Anderson, IT
Is there a good workaround for this?
Pass a dict (not a set built from a formatted string) to writerow.
Ex:
import csv

values = [{'emp_name': 'John Wick',
           'dept': 'Accounting'},
          {'emp_name': 'Neo Anderson',
           'dept': 'IT'}]

with open('file1.csv', mode='w') as csv_file:
    fieldnames = ['emp_name', 'dept']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    for value in values:
        writer.writerow({"emp_name": value['emp_name'], "dept": value['dept']})  # pass the dict
When I run the script with one tax_id, the output is produced without any problem, but when I add multiple tax_ids I get this error: TypeError: 'NoneType' object is not callable.
import requests
from bs4 import BeautifulSoup as bs
import re

url = 'https://www.e-taxes.gov.az/controller'
tax_ids = [
    '1306142621',
    '1403676911',
]
request_payloads = {"METHOD": "wsEbynGetDebetSum",
                    "voen": tax_ids,
                    }

with requests.Session() as s:
    for tax_id in tax_ids:
        request_payloads['voen'] = tax_id
        r = s.post(url, params=request_payloads)
        url_t = r.url
        response = requests.get(url_t)
        s = bs(response.content, 'lxml')
        output = s.text.strip()
        debt = re.search('"debet":"(.+?)"}}', output)
        if debt:
            m = debt.group(1)
            print([tax_id] + [m])
The error message is:
Traceback (most recent call last):
  File "taxDebt.py", line 19, in <module>
    r = s.post(url, params=request_payloads)
TypeError: 'NoneType' object is not callable
This line:
s = bs(response.content, 'lxml')
is overwriting the session variable declared here:
with requests.Session() as s:
On the next iteration, s.post is looked up on the BeautifulSoup object instead of the session; BeautifulSoup returns None for unknown attribute names, so calling it raises the TypeError. Change the s = bs(...) variable name to something else:
with requests.Session() as s:
    for tax_id in tax_ids:
        request_payloads['voen'] = tax_id
        r = s.post(url, params=request_payloads)
        url_t = r.url
        response = requests.get(url_t)
        s1 = bs(response.content, 'lxml')
        output = s1.text.strip()
        debt = re.search('"debet":"(.+?)"}}', output)
        if debt:
            m = debt.group(1)
            print([tax_id] + [m])
Result:
['1306142621', '0.00']
['1403676911', '0.00']
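A side note of my own, not from the original answer: the follow-up request could also reuse the same session instead of a fresh requests.get, which keeps any cookies from the POST. Assuming the endpoint behaves the same way, a sketch would be:

import requests
from bs4 import BeautifulSoup as bs
import re

url = 'https://www.e-taxes.gov.az/controller'
tax_ids = ['1306142621', '1403676911']
request_payloads = {"METHOD": "wsEbynGetDebetSum", "voen": None}

with requests.Session() as s:
    for tax_id in tax_ids:
        request_payloads['voen'] = tax_id
        r = s.post(url, params=request_payloads)
        response = s.get(r.url)                 # reuse the session for the follow-up GET
        soup = bs(response.content, 'lxml')     # a distinct name, so s is never shadowed
        debt = re.search('"debet":"(.+?)"}}', soup.text.strip())
        if debt:
            print([tax_id, debt.group(1)])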
I want to check the price of an item on the Best Buy website, but the access is denied. Does anyone have advice on how to get access? Thanks!
My code:
import requests
import bs4 as bs

url = "https://www.bestbuy.com/site/lg-65-class-oled-b9-series-2160p-smart-4k-uhd-tv-with-hdr/6360611.p?skuId=6360611"
url_get = requests.get(url)
soup = bs.BeautifulSoup(url_get.content, 'lxml')

with open('url_bestbuy.txt', 'w', encoding='utf-8') as f_out:
    f_out.write(soup.prettify())

js_test = soup.find('span', id='priceblock_ourprice')
if js_test is None:
    js_test = soup.find('span', id='div.price-block')

str = ""
for line in js_test.stripped_strings:
    str = line

# convert to integer
str = str.replace(", ", "")
str = str.replace("$", "")
current_price = int(float(str))

your_price = 2000
if current_price < your_price:
    print("I can afford it")
else:
    print("Price is high please wait for the best deal")
The response I get back is: You don't have permission to access "http://www.bestbuy.com/site/lg-65-class-oled-b9-series-2160p-smart-4k-uhd-tv-with-hdr/6360611.p?" on this server.
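This question has no answer in the thread. As a hedged suggestion only: "You don't have permission to access" responses like this usually come from basic bot detection, and the first thing to try is sending browser-like request headers. A sketch, with an example User-Agent string that is just a placeholder and no guarantee it gets past Best Buy's protection:

import requests
from bs4 import BeautifulSoup

url = "https://www.bestbuy.com/site/lg-65-class-oled-b9-series-2160p-smart-4k-uhd-tv-with-hdr/6360611.p?skuId=6360611"
headers = {
    # A browser-like User-Agent; the exact string is only an example.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}
resp = requests.get(url, headers=headers)
print(resp.status_code)  # if this is still 403, the block is more than header-based
soup = BeautifulSoup(resp.content, "lxml")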
My web scraping script is returning duplicate results for some reason. I've tried so many alternatives but just can't get it to work whatsoever. Can anyone help, please?
import requests
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import csv

soup = []
pages = []
csv_file = open('444.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Practice', 'Practice Manager'])

for i in range(35899, 35909):
    url = 'https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i)
    pages.append(url)

for item in pages:
    page = requests.get(item)
    soup.append(bs(page.text, 'lxml'))

business = []
for items in soup:
    h1Obj = items.select('[class^=panel]:has([class^="gp notranslate"]:contains(""))')
    for i in h1Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h1':
                business.append(tag.text)
            else:
                print('no-business')

names = []
for items in soup:
    h4Obj = items.select('[class^=panel]:not(p):has([class^="staff-title"]:contains("Practice Manager"))')
    for i in h4Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h4':
                names.append(tag.text)
            else:
                print('no-name')

print(business, names)
csv_writer.writerow([business, names])
csv_file.close()
It's currently returning duplicate values on all.
What it needs to do is return one 'business' and one 'names' value per url call. If there is no 'business' or 'name', it needs to return a value of 'no-business' or 'no-name'.
Can anyone please help me?
I don't know if it's the best way of doing it, but I used a set instead of a list to remove duplicates, and just before saving the file I convert the set back to a list, like this:
import requests
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import csv

soup = []
pages = []
csv_file = open('444.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Practice', 'Practice Manager'])

for i in range(35899, 35909):
    url = 'https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i)
    pages.append(url)

for item in pages:
    page = requests.get(item)
    soup.append(bs(page.text, 'lxml'))

business = set()
for items in soup:
    h1Obj = items.select('[class^=panel]:has([class^="gp notranslate"]:contains(""))')
    for i in h1Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h1':
                business.add(tag.text)
            else:
                print('no-business')

names = set()
for items in soup:
    h4Obj = items.select('[class^=panel]:not(p):has([class^="staff-title"]:contains("Practice Manager"))')
    for i in h4Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h4':
                names.add(tag.text)
            else:
                print('no-name')

print(business, names)
csv_writer.writerow([list(business), list(names)])
csv_file.close()
You could use the following [id]-based selector to generate the initial list of lists. You could also write each row straight to the CSV rather than appending to a final list.
import requests
from bs4 import BeautifulSoup as bs

results = []
with requests.Session() as s:
    for i in range(35899, 35909):
        r = s.get('https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i))
        soup = bs(r.content, 'lxml')
        row = [item.text for item in soup.select('.staff-title:has(em:contains("Practice Manager")) [id]')]
        if not row: row = ['no practice manager']
        practice = soup.select_one('.gp').text if soup.select_one(':has(#org-title)') else 'No practice name'
        row.insert(0, practice)
        results.append(row)
print(results)
I'm not sure how you want multiple names listed out. A version that writes each row to the CSV:
import requests
from bs4 import BeautifulSoup as bs
import csv

with open('output.csv', 'w', newline='') as csvfile:
    w = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    with requests.Session() as s:
        for i in range(35899, 35909):
            r = s.get('https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i))
            soup = bs(r.content, 'lxml')
            row = [item.text for item in soup.select('.staff-title:has(em:contains("Practice Manager")) [id]')]
            if not row: row = ['no practice manager']
            practice = soup.select_one('.gp').text if soup.select_one(':has(#org-title)') else 'No practice name'
            row.insert(0, practice)
            w.writerow(row)
Looks like the problem stems from the fact that, in some of these pages, there is no information at all, and you get a "Profile Hidden" error. I modified your code somewhat, to cover the first 5 pages. Aside from saving to file, it looks like this:
[same imports]
pages = []
for i in range(35899, 35904):
    url = 'https://www.nhs.uk/Services/GP/Staff/DefaultView.aspx?id=' + str(i)
    pages.append(url)

soup = []
for item in pages:
    page = requests.get(item)
    soup.append(bs(page.text, 'lxml'))

business = []
for items in soup:
    h1Obj = items.select('[class^=panel]:has([class^="gp notranslate"]:contains(""))')
    for i in h1Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h1':
                business.append(tag.text)

names = []
for items in soup:
    h4Obj = items.select('[class^=panel]:not(p):has([class^="staff-title"]:contains("Practice Manager"))')
    for i in h4Obj:
        tagArray = i.findChildren()
        for tag in tagArray:
            if isinstance(tag, Tag) and tag.name in 'h4':
                names.append(tag.text)

for bus, name in zip(business, names):
    print(bus, '---', name)
The output looks like this:
Bilbrook Medical Centre --- Di Palfrey
Caversham Group Practice --- Di Palfrey
Caversham Group Practice --- Di Palfrey
The Moorcroft Medical Ctr --- Ms Kim Stanyer
Brotton Surgery --- Mrs Gina Bayliss
Notice that only the 2nd and 3rd entries are duplicated; that is (somehow, not sure why) caused by the "Hidden Profile" in the third page. So if you modify the main blocks of the code to:
business = []
for items in soup:
    if "ProfileHiddenError.aspx" in str(items):
        business.append('Profile Hidden')
    else:
        h1Obj = items.select('[class^=panel]:has([class^="gp notranslate"]:contains(""))')
        for i in h1Obj:
            tagArray = i.findChildren()
            for tag in tagArray:
                if isinstance(tag, Tag) and tag.name in 'h1':
                    business.append(tag.text)

names = []
for items in soup:
    if "ProfileHiddenError.aspx" in str(items):
        names.append('Profile Hidden')
    elif not "Practice Manager" in str(items):
        names.append('No Practice Manager Specified')
    else:
        h4Obj = items.select('[class^=panel]:not(p):has([class^="staff-title"]:contains("Practice Manager"))')
        for i in h4Obj:
            tagArray = i.findChildren()
            for tag in tagArray:
                if isinstance(tag, Tag) and tag.name in 'h4':
                    names.append(tag.text)

for bus, name in zip(business, names):
    print(bus, '---', name)
The output, this time, is:
Bilbrook Medical Centre --- Di Palfrey
Caversham Group Practice --- No Practice Manager Specified
Profile Hidden --- Profile Hidden
The Moorcroft Medical Ctr --- Ms Kim Stanyer
Brotton Surgery --- Mrs Gina Bayliss
Hopefully this helps you troubleshoot the problem.