AttributeError: 'WebElement' object has no attribute 'extract_first' - python-3.x

I am trying to run the script below to extract the tags from a webpage and save them into a csv file.
In detail, I want to extract the tags associated with a class name.
However, I am getting this error: AttributeError:
'WebElement' object has no attribute 'extract_first'.
The script is the following:
import csv
from selenium import webdriver
from time import sleep
from parsel import Selector
from selenium.webdriver.common.keys import Keys
from collections import defaultdict
from selenium.webdriver.support.select import Select

####### reading from the input file ##########
columns = defaultdict(list)  # each value in each column is appended to a list

# get the list of keywords from the csv file
with open('query.csv', 'r') as csvfile:
    reader = csv.DictReader(csvfile)  # read rows into a dictionary format
    for row in reader:  # read a row as {column1: value1, column2: value2,...}
        for (k, v) in row.items():  # go over each column name and value
            columns[k].append(v)  # append the value into the appropriate list

# the list containing all of the keywords
search_query_list = columns['Keyword']

########## start scraping ###############
rb_results = []

# create a driver and let it open google chrome
driver = webdriver.Chrome("chromedriver")
# get website
driver.get('https://www.redbubble.com/')
sleep(0.5)

for i in range(len(search_query_list)):
    next_query = search_query_list[i]
    # get RB website
    driver.get('https://www.redbubble.com/')
    # get the search by its id
    search_bar = driver.find_element_by_name("query")
    sleep(0.5)
    # enter the query to the search bar
    search_bar.send_keys(next_query)
    # press enter
    search_bar.send_keys(Keys.RETURN)
    sleep(1)
    # from parsel's selector get the page source
    sel1 = Selector(text=driver.page_source)
    sleep(0.5)
    # get first shirt
    continue_link = driver.find_element_by_class_name('shared-components-ShopSearchSkeleton-ShopSearchSkeleton__composedComponentWrapper--1s_CI').click()
    sleep(1)
    sel2 = Selector(text=driver.page_source)
    sleep(0.5)
    ################## get TAGS ###############
    # Check tags for all products
    try:
        # get the tags for the search query
        tags_rb = driver.find_element_by_class_name("shared-components-Tags-Tags__listContent--oLdDf").extract_first()
        tags_rb = str(tags_rb)
        # if number of products is found print it and search for the prime
        # print the number of products found
        if tags_rb == None:
            rb_results.append("0")
        else:
            #rb_results = str(tags_rb)
            rb_results.append(tags_rb)
    except ValueError:
        pass
    #except:
        #rb_results.append("errore")

###### writing part ########
with open("rb_results.csv", "w", newline='') as resultFile:
    writer = csv.DictWriter(resultFile, fieldnames=["Rb Results"], delimiter='\t')
    writer.writeheader()
    writer.writerows({'Rb Results': item} for item in rb_results)
    resultFile.close()
Any ideas about how to fix it and extract the text of shared-components-Tags-Tags__listContent--oLdDf? Many thanks!!!

If I understand correctly, you want the element's text. You can do it like this:
replace:
tags_rb = driver.find_element_by_class_name("shared-components-Tags-Tags__listContent--oLdDf").extract_first()
with:
tags_rb = driver.find_element_by_class_name("shared-components-Tags-Tags__listContent--oLdDf").text
You are getting the error
'WebElement' object has no attribute 'extract_first'
because a WebElement does not have an .extract_first() method.
PS: you don't need this:
tags_rb = str(tags_rb)
The code block to replace is:
# Check tags for all products
try:
    # get the tags for the search query
    tags_rb = driver.find_element_by_class_name("shared-components-Tags-Tags__listContent--oLdDf").text  # get text
    # tags_rb = str(tags_rb)  # no need for this line
    # if number of products is found print it and search for the prime
    # print the number of products found
    if tags_rb == None:
        rb_results.append("0")
    else:
        #rb_results = str(tags_rb)
        rb_results.append(tags_rb)
except ValueError:
    pass
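As a side note, .extract_first() is a method of parsel selectors, not of Selenium WebElements. Since the script already builds a parsel Selector from the page source, the tag text could also be pulled that way; a minimal sketch, reusing the sel2 selector and the class name from the question:
# Alternative using the parsel Selector (sel2) already created in the question
tags_rb = sel2.css(".shared-components-Tags-Tags__listContent--oLdDf ::text").extract()
if not tags_rb:
    rb_results.append("0")
else:
    # join the individual text nodes of the tag list into one string
    rb_results.append(" ".join(t.strip() for t in tags_rb if t.strip()))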

Related

Add Image/thumbnail in my python dataframe

I am working on a project where I need to create a movie database.
I have created my database and imported the links from IMDb that redirect you to the webpage. I would also like to add the main image/thumbnail of each movie so that I can then use the csv in Power BI.
However, I did not manage to do it.
I have tried this:
import requests
from bs4 import BeautifulSoup
import numpy as np

images = []
for i in df_database_url['Url Film']:
    r = requests.get(i)
    soup = BeautifulSoup(r.content, "html.parser")
    images.append(image_url)
But my goal is to have a column that includes the thumbnail for each movie.
Assuming that i is an IMDb movie URL (the kind that starts with https://www.imdb.com/title), you can target the script tag that seems to contain most of the main information for the movie. You can get that with
# import json
image_url = json.loads(soup.select_one('script[type="application/ld+json"]').text)['image']
or, if we're more cautious:
# import json
scCont = [s.text for s in soup.select('script[type="application/ld+json"]') if '"image"' in s.text]
if scCont:
    try:
        scCont = json.loads(scCont[0])
        if 'image' not in scCont:
            image_url = None
            print('No image found for', i)
        else:
            image_url = scCont['image']
    except Exception as e:
        image_url = None
        print('Could not parse movie info for', i, '\n', str(e))
else:
    image_url = None
    print('Could not find script with movie info for', i)
(and you can get the trailer thumbnail with scCont['trailer']['thumbnailUrl'])
This way, instead of raising an error if anything on the path to the expected information is unavailable, it will just set image_url to None; if you want it to halt and raise an error in such cases, use the first version.
and then after the loop you can add in the column with something like
df_database_url['image_urls'] = images
(you probably know that...)
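Putting the pieces together, the loop from the question might end up looking like this; a sketch, assuming df_database_url already exists with a 'Url Film' column of IMDb title URLs:
import json
import requests
from bs4 import BeautifulSoup

images = []
for i in df_database_url['Url Film']:
    r = requests.get(i)
    soup = BeautifulSoup(r.content, "html.parser")
    image_url = None
    # look for the ld+json script block that carries the movie metadata
    scCont = [s.text for s in soup.select('script[type="application/ld+json"]') if '"image"' in s.text]
    if scCont:
        try:
            image_url = json.loads(scCont[0]).get('image')
        except Exception as e:
            print('Could not parse movie info for', i, '\n', str(e))
    images.append(image_url)

df_database_url['image_urls'] = images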

Why does exporting to CSV only return URLs but not text with a Pandas dataframe?

I'm trying to get the URLs (lnk) and the paragraphs (txt) extracted by the Python script below into a csv with pandas.
For some reason the generated csv contains the headers (lnk and txt) and the URLs only, but not the corresponding paragraphs.
The CSV file currently contains:
lnk   | txt
url 1 |
url 2 |
What I need would be:
lnk   | txt
url 1 | text 1
url 2 | text 2
Both the URLs and the paragraphs do get printed in the cmd console, though.
Why don't the paragraphs get exported into the csv as well?
What would be a working fix for this problem? Thanks.
(Sorry for the long code, I'm new to Python.)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

url_txt = []

#GET TEXT
def getPageText(url):
    # given a url, get page content
    data = urlopen(url).read()
    # parse as html structured document
    soup = BeautifulSoup(data, 'html.parser')
    # kill javascript content
    for s in soup(["script", "style"]):
        s.replaceWith('')
    #remove the text from this class
    soup.find('p', {"class": "tiny-off"}).decompose()
    #remove the text from this div id
    soup.find('div', id='action-bar-top').decompose()
    #remove the text from this div id
    soup.find('div', id='main-content-sidebar').decompose()
    #remove the text from this class
    soup.find('div', {"class": "legal"}).decompose()
    #get the 1st paragraph (which is a link)
    for p in soup.find_all('p')[0]:
        lnk = p.get_text()
        print(lnk)
    #remove the 1st paragraph (the link) from the following combined paragraphs
    soup.find('p', id='current_url').decompose()
    #extract all paragraphs save the 1st (the link)
    for p in soup.find_all('p'):
        txt = p.get_text().replace("\r", "").replace("\n", "")
        print(txt)
    # Compiling the info
    lnktxt_data = [lnk, txt]
    # Append the info to the complete dataset
    url_txt.append(lnktxt_data)

#Get text from multiple urls
def main():
    urls = [
        'https://stackoverflow.com/questions/63400153/how-to-export-pandas-dataframe-into-csv-file', #dummy page
        'https://stackoverflow.com/questions/52716762/how-to-join-newlines-into-a-paragraph-in-python' #dummy page
    ]
    txt = [getPageText(url) for url in urls]
    for t in txt:
        print(t)

if __name__ == "__main__":
    main()

#FRAME DATA
# Making the dataframe
url_txt = pd.DataFrame(url_txt, columns=['lnk', 'txt'])
url_txt.head()

#CREATE A FILE
# Save as CSV File
url_txt.to_csv('url_txt.csv', index=False)
I've found a simpler way that's working (with room for improvement), with help from these two previous answers:
How to join newlines into a paragraph in python
How to scrape web news and combine paragraphs into each article
Please let me know below how you would improve it if you find a better way.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

#GET TEXT
def getPageText(url):
    # given a url, get page content
    data = urlopen(url).read()
    # parse as html structured document
    soup = BeautifulSoup(data, 'html.parser')
    # kill javascript content
    for s in soup(["script", "style"]):
        s.replaceWith('')
    #
    for p in soup.find_all('p')[1]:
        lnk = p.get_text()
        print(lnk)
    #
    # find body and extract text
    p = soup.find("div", attrs={'class': 'article-content retro-folders'})
    p.append(p.get_text())
    x = p.text
    y = x.replace("\r", "").replace("\n", "")
    print(y)
    # Compiling the info
    lnktxt_data = [lnk, y]
    # Append the info to the complete dataset
    url_txt.append(lnktxt_data)

url_txt = []

#Get text from multiple urls
def main():
    urls = [
        'https://stackoverflow.com/questions/63400153/how-to-export-pandas-dataframe-into-csv-file', #dummy page
        'https://stackoverflow.com/questions/52716762/how-to-join-newlines-into-a-paragraph-in-python' #dummy page
    ]
    txt = [getPageText(url) for url in urls]
    for t in txt:
        print(t)

if __name__ == "__main__":
    main()

#FRAME DATA
# Making the dataframe
url_txt = pd.DataFrame(url_txt, columns=['lnk', 'y'])
url_txt.head()

#CREATE A FILE
# Save as CSV File
url_txt.to_csv('url_txt.csv', index=False)
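Since the answer asks for suggested improvements, one possible tidy-up is to return the row from the function instead of appending to a global list, and to let pandas build the frame directly; a rough sketch under the same page-structure assumptions as the code above (the div class and paragraph position come from that code):
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

def get_page_text(url):
    soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
    # drop javascript and css content
    for s in soup(["script", "style"]):
        s.replace_with('')
    # the second paragraph holds the link text, as in the code above
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
    lnk = paragraphs[1] if len(paragraphs) > 1 else ''
    # main article body
    body = soup.find("div", attrs={'class': 'article-content retro-folders'})
    txt = body.get_text(" ", strip=True) if body else ''
    return [lnk, txt]

urls = [
    'https://stackoverflow.com/questions/63400153/how-to-export-pandas-dataframe-into-csv-file',
    'https://stackoverflow.com/questions/52716762/how-to-join-newlines-into-a-paragraph-in-python'
]
rows = [get_page_text(u) for u in urls]
pd.DataFrame(rows, columns=['lnk', 'txt']).to_csv('url_txt.csv', index=False)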

How can I catch any error caused by an unset module variable?

So I have the functions list and show_list, but I need to catch any error caused by an unset module variable. Can someone tell me what the first thing I should do is?
This is my code at the moment:
import csv

filep #filepath
menulist = []

def list():
    """Function to read the csv file, create a nested list
    and return the list that is sorted based on calories
    in the ascending order."""
    global menulist
    menulist = [] #store items
    with open(filep) as csv_file: #read file
        reader = csv.reader(csv_file, delimiter=',')
        next(reader, None)
        for row in reader:
            row[2] = int(row[2].strip())
            row[1] = float(row[1].strip())
            if row[2] > 100 and row[2] < 200:
                menulist.append(row)
    menulist.sort(key=lambda x: x[-1])

def show_list(): #Function to display menu
    global menulist
    for i in range(len(menulist)):
        print('%-4d%-20s%-15s%-15s' % (i + 1, menulist[i][0], menulist[i][2], menulist[i][1]))

list()
show_list()
For example, if the variable file is not set before list() is called, the function needs to catch the error and print an appropriate comment.
You are using a built-in name as your function name, which is not considered good practice in Python: it shadows the built-in list() that is used to create a list. You also need to define a variable before you can use it.
Here's how you can catch the error and print an appropriate comment, with the variable defined:
import csv

filep = str(input("Enter the file name with path : "))  # filepath
menulist = []

def calorieList():
    """Function to read the csv file, create a nested list
    and return the list that is sorted based on calories
    in the ascending order."""
    global menulist
    menulist = []  # store items
    with open(filep) as csv_file:  # read file
        reader = csv.reader(csv_file, delimiter=",")
        next(reader, None)
        for row in reader:
            row[2] = int(row[2].strip())
            row[1] = float(row[1].strip())
            if row[2] > 100 and row[2] < 200:
                menulist.append(row)
    menulist.sort(key=lambda x: x[-1])

def show_list():  # Function to display menu
    global menulist
    for i in range(len(menulist)):
        print("%-4d%-20s%-15s%-15s" % (i + 1, menulist[i][0], menulist[i][2], menulist[i][1]))

try:
    calorieList()
except FileNotFoundError:
    print("Enter a valid file path.")
show_list()
I assume you meant filep by the file variable.
If you try to access filep before it is set, your program should raise NameError: name 'filep' is not defined.
But if you want to raise a custom error message, you can just use a try-except block as follows:
import csv

filep #filepath
menulist = []

def list():
    """Function to read the csv file, create a nested list
    and return the list that is sorted based on calories
    in the ascending order."""
    global menulist
    menulist = [] #store items
    try:
        with open(filep) as csv_file: #read file
            reader = csv.reader(csv_file, delimiter=',')
            next(reader, None)
            for row in reader:
                row[2] = int(row[2].strip())
                row[1] = float(row[1].strip())
                if row[2] > 100 and row[2] < 200:
                    menulist.append(row)
    except NameError:
        raise ValueError("Your custom message here")
    menulist.sort(key=lambda x: x[-1])

def show_list(): #Function to display menu
    global menulist
    for i in range(len(menulist)):
        print('%-4d%-20s%-15s%-15s' % (i + 1, menulist[i][0], menulist[i][2], menulist[i][1]))

list()
show_list()

How to determine the number of columns per row variably in a CSV File with Python?

I am analyzing XML-structured text files about insider dealings. I wrote some code that parses the XML structure and writes my output to a CSV file. Each analyzed file is written on its own line, with the extracted information in individual columns. But in some files the same piece of information is present multiple times, and my code overwrites the value in the cell, so in the end only one date ends up in the cell of my CSV file.
import csv
import glob
import re
import string
import time
import bs4 as bs

# User defined directory for files to be parsed
TARGET_FILES = r'D:\files\'
# User defined file pointer to LM dictionary
# User defined output file
OUTPUT_FILE = r'D:\ouput\Parser.csv'
# Setup output
OUTPUT_FIELDS = [r'Datei', 'transactionDate', r'transactionsCode', r'Director', r'Officer', r'Titel', r'10-% Eigner', r'sonstiges', r'SignatureDate']

def main():
    f_out = open(OUTPUT_FILE, 'w')
    wr = csv.writer(f_out, lineterminator='\n', delimiter=';')
    wr.writerow(OUTPUT_FIELDS)
    file_list = glob.glob(TARGET_FILES)
    for file in file_list:
        print(file)
        with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
            soup = bs.BeautifulSoup(f_in, 'xml')
            output_data = get_data(soup)
            output_data[0] = file
            wr.writerow(output_data)

def get_data(soup):
    # overrides the transactionDate if more than one transactions disclosed on the current form
    # the number determine the column for the output
    _odata = [0] * 9
    try:
        for item in soup.find_all('transactionDate'):
            _odata[1] = item.find('value').text
    except AttributeError:
        _odata[1] = ('keine Angabe')
    try:
        for item in soup.find_all('transactionAcquiredDisposedCode'):
            _odata[2] = item.find('value').text
    except AttributeError:
        _odata[2] = 'ka'
    for item in soup.find_all('reportingOwnerRelationship'):
        try:
            _odata[3] = item.find('isDirector').text
        except AttributeError:
            _odata[3] = ('ka')
        try:
            _odata[4] = item.find('isOfficer').text
        except AttributeError:
            _odata[4] = ('ka')
        try:
            _odata[5] = item.find('officerTitle').text
        except AttributeError:
            _odata[5] = 'ka'
        try:
            _odata[6] = item.find('isTenPercentOwner').text
        except AttributeError:
            _odata[6] = ('ka')
        try:
            _odata[7] = item.find('isOther').text
        except AttributeError:
            _odata[7] = ('ka')
    try:
        for item in soup.find_all('ownerSignature'):
            _odata[8] = item.find('signatureDate').text
    except AttributeError:
        _odata[8] = ('ka')
    return _odata

if __name__ == '__main__':
    print('\n' + time.strftime('%c') + '\nGeneric_Parser.py\n')
    main()
    print('\n' + time.strftime('%c') + '\nNormal termination.')
Actually the code works, but it overwrites columns if, for example, more than one transaction date is given in a file. So I need the code to automatically use the next column for each additional transaction date. How could this work?
I would be glad if someone has a solution for my problem. Thanks a lot!
Your issue is that you are iterating over the result of soup.find_all() and writing to the same slot on every iteration. You need to do something with _odata in each iteration, otherwise you will only end up with whatever was written to it last.
If you can show us what the data you're trying to parse actually looks like, perhaps we could give a more specific answer.
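For example, one way to keep every value instead of overwriting the same slot is to collect them into a list and either join them into one cell or append them as extra columns; a rough sketch of how get_data could treat the transaction dates, using the same tag names as the question:
# collect all transaction dates instead of repeatedly overwriting _odata[1]
transaction_dates = []
for item in soup.find_all('transactionDate'):
    value = item.find('value')
    if value is not None:
        transaction_dates.append(value.text)

if not transaction_dates:
    _odata[1] = 'keine Angabe'
else:
    # either join them into a single cell ...
    _odata[1] = '; '.join(transaction_dates)
    # ... or give each date its own column by extending the row:
    # _odata.extend(transaction_dates[1:])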

How to get maxresults for search from Youtube Data API v3?

I want to get as many results as possible for a particular YouTube search query. However, the maximum number of results that can be retrieved per request is 50. I know that nextPageToken can be used to retrieve the results of the next page. How do I modify the Python code below to achieve this?
#!/usr/bin/python
# original source example: https://developers.google.com/youtube/v3/docs/search/list
# assumes use of Python 3

# This sample executes a search request for the specified search term.
# Sample usage:
#   python search.py --q=surfing --max-results=10
# NOTE: To use the sample, you must provide a developer key obtained
# in the Google APIs Console. Search for "REPLACE_ME" in this code
# to find the correct place to provide that key..

import argparse

# library googleapiclient installed with: pip install --upgrade google-api-python-client
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# Set DEVELOPER_KEY to the API key value from the APIs & auth > Registered apps
# tab of
#   https://cloud.google.com/console
# Please ensure that you have enabled the YouTube Data API for your project.
DEVELOPER_KEY = 'KEY'
YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'

def youtube_search(query_term, max_results):
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)
    # Call the search.list method to retrieve results matching the specified
    # query term.
    search_response = youtube.search().list(
        q=query_term,
        part='id,snippet',
        type='video',
        relevanceLanguage='en',
        maxResults=max_results
    ).execute()
    video_ids = []
    # Add each result to the appropriate list, and then display the lists of
    # matching videos, channels, and playlists.
    for search_result in search_response.get('items', []):
        video_ids.append(search_result['id']['videoId'])
    return video_ids

if __name__ == '__main__':
    url_prefix = 'https://www.youtube.com/watch?v='
    query_terms = '"my_query"'
    max_results = 50
    try:
        ids = youtube_search(query_terms, max_results)
    except HttpError as e:
        print('An HTTP error %d occurred:\n%s' % (e.resp.status, e.content))
    else:
        with open('output.txt', 'w') as f:
            for i in ids:
                f.write(url_prefix + i + "\n")
Here's what needs to be added to keep fetching results until no nextPageToken is found:
nextPageToken = search_response.get('nextPageToken')
while ('nextPageToken' in search_response):
    nextPage = youtube.search().list(
        q=query_term,
        part='id,snippet',
        type='video',
        relevanceLanguage='en',
        maxResults=max_results,
        pageToken=nextPageToken
    ).execute()
    search_response['items'] = search_response['items'] + nextPage['items']
    if 'nextPageToken' not in nextPage:
        search_response.pop('nextPageToken', None)
    else:
        nextPageToken = nextPage['nextPageToken']
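For context, this loop belongs inside youtube_search, right after the initial search_response is fetched and before the video IDs are collected; a sketch of how the modified function could look, keeping the variable names from the question (note that the API still returns at most 50 results per page, the loop only accumulates the pages):
def youtube_search(query_term, max_results):
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)
    search_response = youtube.search().list(
        q=query_term,
        part='id,snippet',
        type='video',
        relevanceLanguage='en',
        maxResults=max_results
    ).execute()

    # keep fetching pages until the API stops returning a nextPageToken
    nextPageToken = search_response.get('nextPageToken')
    while 'nextPageToken' in search_response:
        nextPage = youtube.search().list(
            q=query_term,
            part='id,snippet',
            type='video',
            relevanceLanguage='en',
            maxResults=max_results,
            pageToken=nextPageToken
        ).execute()
        search_response['items'] += nextPage['items']
        if 'nextPageToken' not in nextPage:
            search_response.pop('nextPageToken', None)
        else:
            nextPageToken = nextPage['nextPageToken']

    # collect every video id from the accumulated items
    return [item['id']['videoId'] for item in search_response.get('items', [])]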
