Two different errors with the same code that I used to scrape other pages - python-3.x

I used this code to scrape two pages from TripAdvisor, and it worked very well. But now it shows me two different errors:
with open("iletaitunsquare1.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
w = csv.writer(csv_file, delimiter = ";", quoting=csv.QUOTE_MINIMAL)
w.writerow(["inf_rest_name", "rest_eclf", "name_client", "date_rev_cli", "opinion_cl"])
with requests. Session() as s:
for offset in range (270,1230,10):
url = f'https://www.tripadvisor.fr/Restaurant_Review-g187147-d6575305-Reviews-or{offset}-Il_Etait_Un_Square-Paris_Ile_de_France.html'
r = s.get(url)
soup = bs(r.content, 'lxml')
reviews = soup.select('.reviewSelector')
ids = [review.get('data.reviewid') for review in reviews]
r = s.post(
'https://www.tripadvisor.fr/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=',
data = {'reviews': ','.join(ids), 'contextChoice': 'DETAIL'},
headers = {'Referer': r.url}
)
soup = bs(r.content, 'lxml')
if not offset:
inf_rest_name = soup.select_one('.heading').text.replace("\n","").strip()
rest_eclf = soup.select_one('.header_links a').text.strip()
for review in soup.select('.reviewSelector'):
name_client = review.select_one('.info_text > div:first-child').text.strip()
date_rev_cl = review.select_one('.ratingDate')['title'].strip()
titre_rev_cl = review.select_one('.noQuotes').text.strip()
opinion_cl = review.select_one('.partial_entry').text.replace("\n","").strip()
row = [f"{inf_rest_name}", f"{rest_eclf}", f"{name_client}", f"{date_rev_cl}", f"{titre_rev_cl}", f"{opinion_cl}"]
w.writerow(row)
Error on execution:
"data = {'reviews': ','.join(ids), 'contextChoice': 'DETAIL'}
TypeError: sequence item 0: expected str instance, NoneType found"
and then I decided to change just the values for the pages of the site (the range) and the URL:
with open("boutary.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
w = csv.writer(csv_file, delimiter = ";", quoting=csv.QUOTE_MINIMAL)
w.writerow(["inf_rest_name", "rest_eclf", "name_client", "date_rev_cl", "titre_rev_cl", "opinion_cl"])
with requests.Session() as s:
for offset in range(40, 290, 10):
url = f'https://www.tripadvisor.fr/Restaurant_Review-g187147-d9783452-Reviews-or{offset}-Boutary-Paris_Ile_de_France.html'
r = s.get(url)
soup = bs(r.content, 'lxml')
reviews = soup.select('.reviewSelector')
ids = [review.get('data-reviewid') for review in reviews]
r = s.post(
'https://www.tripadvisor.fr/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=',
data = {'reviews': ','.join(ids), 'contextChoice': 'DETAIL'},
headers = {'referer': r.url}
)
soup = bs(r.content, 'lxml')
if not offset:
inf_rest_name = soup.select_one('.heading').text.replace("\n","").strip()
rest_eclf = soup.select_one('.header_links a').text.strip()
for review in soup.select('.reviewSelector'):
name_client = review.select_one('.info_text > div:first-child').text.strip()
date_rev_cl = review.select_one('.ratingDate')['title'].strip()
titre_rev_cl = review.select_one('.noQuotes').text.strip()
opinion_cl = review.select_one('.partial_entry').text.replace("\n","").strip()
row = [f"{inf_rest_name}", f"{rest_eclf}", f"{name_client}", f"{date_rev_cl}" , f"{titre_rev_cl}", f"{opinion_cl}"]
w.writerow(row)
and it shows me:
"row = [f"{inf_rest_name}", f"{rest_eclf}", f"{name_client}",
f"{date_rev_cl}" , f"{titre_rev_cl}", f"{opinion_cl}"]
NameError: name 'inf_rest_name' is not defined"
These errors are strange because I used the same code before with other URLs and it worked perfectly.
Can you tell me please what is happening? How can I run it properly? I would appreciate your help.

This is because the original code (from your prior question, not posted here) was relying on the truthy/falsy value of offset 0, which was the first offset there.
For example, with:
for offset in range(0, 10, 10):
    if not offset:
The first value, 0, is falsy, whereas the numbers > 0 (in this scenario) are truthy. "if not offset" is therefore True only when offset is 0, so inf_rest_name is set only on the first loop rather than on every iteration. Its value doesn't change, so there is no need to read it again.
With the following, all values are truthy, so inf_rest_name never gets set:
for offset in range(40, 290, 10):
    if not offset:
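To make the truthiness concrete, a quick check in the interpreter shows why the guard never fires once the range starts above zero:
>>> not 0
True
>>> not 40
False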
You could change to:
if offset == firstvalue:
e.g.
if offset == 40:
    inf_rest_name = soup.select_one('.heading').text.replace("\n", "").strip()
    rest_eclf = soup.select_one('.header_links a').text.strip()
See the Python documentation on truth value testing for more info.
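As an alternative sketch (not part of the original answer), you could avoid hard-coding the first offset by enumerating the range, so the guard keeps working whatever start value you paste in; this assumes the same selectors as above:
import requests
from bs4 import BeautifulSoup as bs

with requests.Session() as s:
    for i, offset in enumerate(range(40, 290, 10)):
        url = f'https://www.tripadvisor.fr/Restaurant_Review-g187147-d9783452-Reviews-or{offset}-Boutary-Paris_Ile_de_France.html'
        soup = bs(s.get(url).content, 'lxml')
        if i == 0:  # first iteration, regardless of the starting offset
            inf_rest_name = soup.select_one('.heading').text.replace("\n", "").strip()
            rest_eclf = soup.select_one('.header_links a').text.strip()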
Those lines also need to work with the first soup, not the later soup (as that contains only the reviews):
import requests
from bs4 import BeautifulSoup as bs

with requests.Session() as s:
    for offset in range(40, 290, 10):
        url = f'https://www.tripadvisor.fr/Restaurant_Review-g187147-d9783452-Reviews-or{offset}-Boutary-Paris_Ile_de_France.html'
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        if offset == 40:
            inf_rest_name = soup.select_one('.heading').text.replace("\n", "").strip()
            rest_eclf = soup.select_one('.header_links a').text.strip()
        reviews = soup.select('.reviewSelector')
        ids = [review.get('data-reviewid') for review in reviews]
        r = s.post(
            'https://www.tripadvisor.fr/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=',
            data={'reviews': ','.join(ids), 'contextChoice': 'DETAIL'},
            headers={'referer': r.url}
        )
        soup = bs(r.content, 'lxml')
        for review in soup.select('.reviewSelector'):
            name_client = review.select_one('.info_text > div:first-child').text.strip()
            date_rev_cl = review.select_one('.ratingDate')['title'].strip()
            titre_rev_cl = review.select_one('.noQuotes').text.strip()
            opinion_cl = review.select_one('.partial_entry').text.replace("\n", "").strip()
            row = [f"{inf_rest_name}", f"{rest_eclf}", f"{name_client}", f"{date_rev_cl}", f"{titre_rev_cl}", f"{opinion_cl}"]
For your first code block you are using an invalid attribute name (data.reviewid). It should be:
ids = [review.get('data-reviewid') for review in reviews]
Note I have added an is None test to handle elements that are not found. This should be added to the top version as well.
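For illustration, here is a minimal standalone demo (not part of the original answer) of why the misspelled attribute surfaces later as the join error: BeautifulSoup's .get() simply returns None for an attribute name that does not exist.
from bs4 import BeautifulSoup as bs

tag = bs('<div class="reviewSelector" data-reviewid="123"></div>', 'lxml').select_one('.reviewSelector')
print(tag.get('data.reviewid'))  # None - no such attribute
print(tag.get('data-reviewid'))  # '123'
# ','.join([None, ...]) is what then raises: TypeError: sequence item 0: expected str instance, NoneType found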
import requests
from bs4 import BeautifulSoup as bs

with requests.Session() as s:
    for offset in range(270, 1230, 10):
        url = f'https://www.tripadvisor.fr/Restaurant_Review-g187147-d6575305-Reviews-or{offset}-Il_Etait_Un_Square-Paris_Ile_de_France.html'
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        if offset == 270:
            inf_rest_name = soup.select_one('.heading').text.replace("\n", "").strip()
            rest_eclf = soup.select_one('.header_links a').text.strip()
        reviews = soup.select('.reviewSelector')
        ids = [review.get('data-reviewid') for review in reviews]
        r = s.post(
            'https://www.tripadvisor.fr/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=',
            data={'reviews': ','.join(ids), 'contextChoice': 'DETAIL'},
            headers={'Referer': r.url}
        )
        soup = bs(r.content, 'lxml')
        for review in soup.select('.reviewSelector'):
            name_client = review.select_one('.info_text > div:first-child')
            if name_client is None:
                name_client = 'N/A'
            else:
                name_client = name_client.text.strip()
            date_rev_cl = review.select_one('.ratingDate')
            if date_rev_cl is None:
                date_rev_cl = 'N/A'
            else:
                date_rev_cl = date_rev_cl['title'].strip()
            titre_rev_cl = review.select_one('.noQuotes')
            if titre_rev_cl is None:
                titre_rev_cl = 'N/A'
            else:
                titre_rev_cl = titre_rev_cl.text.strip()
            opinion_cl = review.select_one('.partial_entry')
            if opinion_cl is None:
                opinion_cl = 'N/A'
            else:
                opinion_cl = opinion_cl.text.replace("\n", "").strip()
            row = [f"{inf_rest_name}", f"{rest_eclf}", f"{name_client}", f"{date_rev_cl}", f"{titre_rev_cl}", f"{opinion_cl}"]
            print(row)
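If some review elements ever lack the attribute, a small guard before the join (my addition, not in the original answer) keeps the POST from failing:
        ids = [review.get('data-reviewid') for review in reviews]
        ids = [i for i in ids if i is not None]  # drop reviews without an id
        if not ids:
            continue  # nothing to expand on this page, move to the next offset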

Related

How to get Google Search Console data using an access_token or refresh_token in Python?

I'm trying to get data from Google Search Console on behalf of the user. Once they log in, it returns an access_token and a refresh_token; using the access_token or refresh_token, how can I get the Google Search Console data (impressions, clicks, pages)?
I am getting data from Google Analytics the same way, but for Google Search Console it doesn't seem to be possible.
def extract_data(site, creds, num_days, output):
    domain_name = get_domain_name(site)
    create_project(domain_name)
    full_path = domain_name + '/' + output
    current_dates = get_dates_from_csv(full_path)
    webmasters_service = authorize_creds(creds)
    # Set up Dates
    end_date = datetime.date.today() - relativedelta.relativedelta(days=3)
    start_date = end_date - relativedelta.relativedelta(days=num_days)
    delta = datetime.timedelta(days=1)  # This will let us loop one day at the time
    scDict = defaultdict(list)
    while start_date <= end_date:
        if current_dates is not None and current_dates.str.contains(
                datetime.datetime.strftime(start_date, '%Y-%m-%d')).any():
            start_date += delta
        else:
            # print('Start date at beginning: %s' % start_date)
            maxRows = 25000  # Maximum 25K per call
            numRows = 0  # Start at Row Zero
            status = ''  # Initialize status of extraction
            # print("status status status status", status)
            while status != 'Finished':  # Test with i < 10 just to see how long the task will take to process.
                request = {
                    'startDate': datetime.datetime.strftime(start_date, '%Y-%m-%d'),
                    'endDate': datetime.datetime.strftime(start_date, '%Y-%m-%d'),
                    'dimensions': ['date', 'page', 'query'],
                    'rowLimit': maxRows,
                    'startRow': numRows
                }
                response = execute_request(webmasters_service, site, request)
                try:
                    # Process the response
                    for row in response['rows']:
                        scDict['date'].append(row['keys'][0] or 0)
                        scDict['page'].append(row['keys'][1] or 0)
                        scDict['query'].append(row['keys'][2] or 0)
                        scDict['clicks'].append(row['clicks'] or 0)
                        scDict['ctr'].append(row['ctr'] or 0)
                        scDict['impressions'].append(row['impressions'] or 0)
                        scDict['position'].append(row['position'] or 0)
                    # print('successful at %i' % numRows)
                except:
                    print('error occurred at %i' % numRows)
                # Add response to dataframe
                df = pd.DataFrame(data=scDict)
                df['clicks'] = df['clicks'].astype('int')
                df['ctr'] = df['ctr'] * 100
                df['impressions'] = df['impressions'].astype('int')
                df['position'] = df['position'].round(2)
                print('Numrows at the start of loop: %i' % numRows)
                try:
                    numRows = numRows + len(response['rows'])
                except:
                    status = 'Finished'
                print('Numrows at the end of loop: %i' % numRows)
                if numRows % maxRows != 0:
                    status = 'Finished'
            start_date += delta
            print('Start date at end: %s' % start_date)
    write_to_csv(df, full_path)
    return df
This is the code I am using for Google Search Console; it uses the webmasters_service = authorize_creds(creds) method, but I want to access the API using an access_token or refresh_token.
This is the code used for Google Analytics:
def google_analytics_reporting_api_data_extraction(viewID, dim, met, start_date,
                                                   end_date, refresh_token,
                                                   transaction_type, goal_number,
                                                   condition):
    viewID = viewID;
    dim = dim;
    met = met;
    start_date = start_date;
    end_date = end_date;
    refresh_token = refresh_token;
    transaction_type = transaction_type;
    condition = condition
    goal_number = goal_number
    viewID = "".join(['ga%3A', viewID])
    if transaction_type == "Goal":
        met1 = "%2C".join([re.sub(":", "%3A", i) for i in met]).replace("XX", str(goal_number))
    elif transaction_type == "Transaction":
        met1 = "%2C".join([re.sub(":", "%3A", i) for i in met])
    dim1 = "%2C".join([re.sub(":", "%3A", i) for i in dim])
    credentials = client.OAuth2Credentials(
        access_token=None, client_id=client_id, client_secret=client_secret, refresh_token=refresh_token,
        token_expiry=3600, token_uri=GOOGLE_TOKEN_URI, user_agent='my-user-agent/1.0', revoke_uri=GOOGLE_REVOKE_URI)
    credentials.refresh(httplib2.Http())
    rt = (json.loads(credentials.to_json()))['access_token']
    api_url = "https://www.googleapis.com/analytics/v3/data/ga?ids="
    url = "".join(
        [api_url, viewID, '&start-date=', start_date, '&end-date=', end_date, '&metrics=', met1, '&dimensions=',
         dim1, '&max-results=1000000', condition, '&access_token=', rt])
    data = pd.DataFrame()
    dataa = pd.DataFrame()
    users = []
    final_date = []
    # try:
    r = requests.get(url)
    # print("r values", list((r.json())['rows']))
    # print("start_date", start_date)
    start = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end - start).days)]
    for each in date_generated:
        date_value = each.date()
        url = "".join(
            [api_url, viewID, '&start-date=', str(each.date()), '&end-date=', str(each.date()), '&metrics=', met1,
             '&dimensions=',
             dim1, '&max-results=1000000', condition, '&access_token=', rt])
        rr = requests.get(url)
        dataa = pd.DataFrame(list((rr.json())['rows']))
        users.append(dataa[0][0])
        final_date.append(str(date_value))
        # print("data and users", users, final_date)
    data = pd.DataFrame(list((r.json())['rows']))
    try:
        data = pd.DataFrame(list((r.json())['rows']), columns=[re.sub("ga:", "", i) for i in met])
        # data['date'] = start_date
        # dim_data = pd.DataFrame(list((r.json())['rows']), columns=[re.sub("ga:", "", i) for i in dim])
        return data, users, final_date
    except:
        print((r.json()))
In the above code we access the data from Google Analytics by using the refresh_token. I want to do the same for Google Search Console.
Please help me out.
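As a hedged, untested sketch (assuming the standard google-api-python-client and google-auth packages, and your own client_id, client_secret and refresh_token), the Search Console service can be built from a refresh token in much the same way, in place of authorize_creds(creds):
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build

credentials = Credentials(
    token=None,  # fetched automatically via the refresh token
    refresh_token=refresh_token,
    token_uri='https://oauth2.googleapis.com/token',
    client_id=client_id,
    client_secret=client_secret,
    scopes=['https://www.googleapis.com/auth/webmasters.readonly'],
)
webmasters_service = build('webmasters', 'v3', credentials=credentials)
# the rest of extract_data() can then use webmasters_service unchanged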

Null to Blank values JSON sending in embed

I'm making a command that will spit out a random drink to make. The problem is, some drinks have more ingredients and measurements than others, which causes some of the keys to come back as null or None. I'm using fix_data to turn the Nones into blanks, but it causes a huge gap in my embed. Is there an easier way to either display the data in the embed or clear the None values from the JSON?
def fix_data(data):
    if type(data) is list:
        for i, e in enumerate(data):
            if e is None:
                data[i] = ''
            else:
                fix_data(e)

if message.content.lower().startswith("!drink"):
    url = 'https://thecocktaildb.com/api/json/v1/1/random.php'
    res = requests.get(url)
    data = res.json()
    fix_data(data)
    drinkname = data['drinks'][0]['strDrink']
    drinkimage = data['drinks'][0]['strDrinkThumb']
    instructions = data['drinks'][0]['strInstructions']
    ingredient1 = data['drinks'][0]['strIngredient1']
    ingredient2 = data['drinks'][0]['strIngredient2']
    ingredient3 = data['drinks'][0]['strIngredient3']
    ingredient4 = data['drinks'][0]['strIngredient4']
    ingredient5 = data['drinks'][0]['strIngredient5']
    ingredient6 = data['drinks'][0]['strIngredient6']
    ingredient7 = data['drinks'][0]['strIngredient7']
    ingredient8 = data['drinks'][0]['strIngredient8']
    ingredient9 = data['drinks'][0]['strIngredient9']
    ingredient10 = data['drinks'][0]['strIngredient10']
    ingredient11 = data['drinks'][0]['strIngredient11']
    ingredient12 = data['drinks'][0]['strIngredient12']
    ingredient13 = data['drinks'][0]['strIngredient13']
    ingredient14 = data['drinks'][0]['strIngredient14']
    ingredient15 = data['drinks'][0]['strIngredient15']
    measure1 = data['drinks'][0]['strMeasure1']
    measure2 = data['drinks'][0]['strMeasure2']
    measure3 = data['drinks'][0]['strMeasure3']
    measure4 = data['drinks'][0]['strMeasure4']
    measure5 = data['drinks'][0]['strMeasure5']
    measure6 = data['drinks'][0]['strMeasure6']
    measure7 = data['drinks'][0]['strMeasure7']
    measure8 = data['drinks'][0]['strMeasure8']
    measure9 = data['drinks'][0]['strMeasure9']
    measure10 = data['drinks'][0]['strMeasure10']
    measure11 = data['drinks'][0]['strMeasure11']
    measure12 = data['drinks'][0]['strMeasure12']
    measure13 = data['drinks'][0]['strMeasure13']
    measure14 = data['drinks'][0]['strMeasure14']
    measure15 = data['drinks'][0]['strMeasure15']
    msg = '**Drink**: \n{}\n\n**Ingredients**: \n{} {}\n{} {}\n{} {}\n{} {}\n{} {}\n{} {}\n{} {}\n{} {}\n{} {}\n{} {}\n{} {}\n{} {}\n{} {}\n{} {}\n{} {}\n\n**Instructions**: \n{}'.format(drinkname, measure1, ingredient1, measure2, ingredient2, measure3, ingredient3, measure4, ingredient4, measure5, ingredient5, measure6, ingredient6, measure7, ingredient7, measure8, ingredient8, measure9, ingredient9, measure10, ingredient10, measure11, ingredient11, measure12, ingredient12, measure13, ingredient13, measure14, ingredient14, measure15, ingredient15, instructions)
    embed = discord.Embed(title="**🍸 DRINK BOT 🍸**", description=msg, color=0x9554af)
    embed.set_author(name="DRINK BOT", icon_url="https://i.imgur.com/RI2iiLd.png")
    embed.set_thumbnail(url='{}'.format(drinkimage))
    embed.set_footer(text="DRINK BOT", icon_url="https://i.imgur.com/RI2iiLd.png")
    await client.send_message(message.channel, embed=embed)
Use a for loop and an if statement to check whether each ingredient is None:
import requests

url = 'https://thecocktaildb.com/api/json/v1/1/random.php'
res = requests.get(url)
data = res.json()
drinks = data['drinks'][0]
ingredients_list = ""
for i in range(1, 16):
    if drinks[f"strIngredient{i}"]:
        measure = drinks[f"strMeasure{i}"]
        ingredient = drinks[f"strIngredient{i}"]
        ingredients_list += f"{measure if measure else ''} {ingredient}\n"
drinkname = drinks['strDrink']
drinkimage = drinks['strDrinkThumb']
instructions = drinks['strInstructions']
msg = f"**Drink**:\n {drinkname}\n\n **Ingredients**:\n {ingredients_list}\n **Instructions**: \n{instructions}"
print(msg)
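A hedged sketch of how this could slot back into your command, reusing the same embed calls from your question (same discord client, message and imports assumed):
embed = discord.Embed(title="**🍸 DRINK BOT 🍸**", description=msg, color=0x9554af)
embed.set_author(name="DRINK BOT", icon_url="https://i.imgur.com/RI2iiLd.png")
embed.set_thumbnail(url=drinkimage)
embed.set_footer(text="DRINK BOT", icon_url="https://i.imgur.com/RI2iiLd.png")
await client.send_message(message.channel, embed=embed)  # same (older) discord.py call as in your code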

Web scraping doesn't fill the file with all the information requested from all pages

I'm a beginner in Python and I need to scrape the restaurant name, socioeconomic status, client names, review dates, review titles and reviews from pages 10 to 40 of a single restaurant (Python 3.7 and Beautiful Soup). But when I open the CSV file, I only have the information about the first reviewer. This is my code:
csv_file = open("lebouclard.csv", "w", encoding="utf-8")
csv_writer = csv.writer(csv_file, delimiter=";")
csv_writer.writerow(["inf_rest_name", "rest_eclf", "name_client", "date_rev_cl", "titre_rev_cl", "opinion_cl"])
for i in range(10, 40):
    url = requests.get("https://www.tripadvisor.fr/Restaurant_Review-g187147-d947475-Reviews-or10-Le_Bouclard-Paris_Ile_de_France.html".format(i)).text
    page_soup = soup(url, "html.parser")
    gen_rest = page_soup.find_all("div", {"class": "page"})
    for rest in gen_rest:
        rname = rest.find("h1", {"class": "ui_header h1"})
        inf_rest_name = rname.text
        print("inf_rest_name: " + inf_rest_name)
        econ_class_food = rest.find("div", {"class": "header_links"})
        rest_eclf = econ_class_food.text.strip()
        print("rest_eclf: " + rest_eclf)
    for clients in gen_rest:
        client_info = clients.find_all("div", {"class": "info_text"})
        name_client = client_info[0].text
        print("name_client: " + name_client)
        date_review = clients.find_all("span", {"class": "ratingDate"})
        date_rev_cl = date_review[0].text.strip()
        print("date_rev_cl: " + date_rev_cl)
        titre_review = clients.find_all("span", {"class": "noQuotes"})
        titre_rev_cl = titre_review[0].text.strip()
        print("titre_rev_cl: " + titre_rev_cl)
        opinion = clients.find_all("p", {"class": "partial_entry"})
        opinion_cl = opinion[0].text.replace("\n", "")
        print("opinion_cl: " + opinion_cl)
        csv_writer.writerow([inf_rest_name, rest_eclf, name_client, date_rev_cl, titre_rev_cl, opinion_cl])
csv_file.close()
I tried to eliminate the for clients in gen_rest loop and put:
client_info = rest.find_all("div", {"class":"info_text"})
name_client = client_info[0].text
print("name_client: " + name_client)
date_review = rest.find_all("span", {"class":"ratingDate"})
date_rev_cl = date_review[0].text.strip()
print("date_rev_cl: " + date_rev_cl)
titre_review = rest.find_all("span", {"class":"noQuotes"})
titre_rev_cl = titre_review[0].text.strip()
print("titre_rev_cl: " + titre_rev_cl)
opinion = rest.find_all("p", {"class":"partial_entry"})
opinion_cl = opinion[0].text.replace("\n","")
print("opinion_cl: " + opinion_cl)
but it shows me the same information in the CSV file. After that I decided to eliminate find_all and the [0], but the results were the same. What am I missing? I have read the other questions about this but I can't find my error.
Try the following, where an f-string is used so the value for the next set of reviews is passed into the URL during the loop.
import requests, csv
from bs4 import BeautifulSoup as bs

with open("lebouclard.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
    w = csv.writer(csv_file, delimiter=";", quoting=csv.QUOTE_MINIMAL)
    w.writerow(["inf_rest_name", "rest_eclf", "name_client", "date_rev_cl", "titre_rev_cl", "opinion_cl"])
    with requests.Session() as s:
        for offset in range(0, 40, 10):
            url = f'https://www.tripadvisor.fr/Restaurant_Review-g187147-d947475-Reviews-or{offset}-Le_Bouclard-Paris_Ile_de_France.html'
            r = s.get(url)
            soup = bs(r.content, 'lxml')
            if not offset:
                inf_rest_name = soup.select_one('.heading').text.replace("\n", "").strip()
                rest_eclf = soup.select_one('.header_links a').text.strip()
            for review in soup.select('.reviewSelector'):
                name_client = review.select_one('.info_text > div:first-child').text.strip()
                date_rev_cl = review.select_one('.ratingDate')['title'].strip()
                titre_rev_cl = review.select_one('.noQuotes').text.strip()
                opinion_cl = review.select_one('.partial_entry').text.replace("\n", "").strip()
                row = [f"{inf_rest_name}", f"{rest_eclf}", f"{name_client}", f"{date_rev_cl}", f"{titre_rev_cl}", f"{opinion_cl}"]
                w.writerow(row)
For my settings, in order for this to work properly, I have to set delimiter as "," not ";"
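That is a one-character change to the writer setup above:
w = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_MINIMAL)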

Scrape product specification from Amazon using Scrapy

Hello, I want to scrape the product specification table available on the product page at https://www.amazon.com/dp/B07HJ41HCF, for which I have written the following spider in Scrapy.
def parse(self, response):
    item = GraingerItem()
    item['url'] = response.url
    item['proddescription'] = response.xpath('//*[@id="productDetails_detailBullets_sections1"]/td[1]/th/text()').extract()
    item['title'] = response.xpath('//*[@id="productTitle"]/text()').extract()[0].strip()
    try:
        item['sellername'] = response.xpath('//*[@id="bylineInfo"]/text()').extract()[0].strip()
    except IndexError:
        item['sellername'] = "No Seller Name"
    gg = []
    cc = response.xpath('//*[@class="a-link-normal a-color-tertiary"]')
    for bb in cc:
        dd = bb.xpath('text()').extract()[0].strip()
        gg.append(dd)
        gg.append(">")
    qq = str(gg)
    qr = qq.replace("'", "")
    qs = qr.replace(">]", "")
    qt = qs.replace("[", "")
    qu = qt.replace(",", "")
    item['travlink'] = qu
    try:
        item['rating'] = response.xpath('//*[@id="acrPopover"]/span[1]/a/i[1]/span/text()').extract()[0].strip()
    except IndexError:
        item['rating'] = "Be the First one to review"
    try:
        item['Crreview'] = response.xpath('//*[@id="acrCustomerReviewText"]/text()').extract()[0].strip()
    except IndexError:
        item['Crreview'] = "Be the First one to review"
    dd = response.xpath('//*[@id="feature-bullets"]/ul')
    ft = []
    for i in range(2, 40):
        q = str(i)
        trows = "li[" + q + "]"
        xpathgiven = trows + "/span/text()"
        for bullets in dd:
            b1 = bullets.xpath(xpathgiven).extract()
            for ac in b1:
                ab = ac.replace("\xa0", "")
            ft.append(b1)
            ft.append(";")
    stft = str(ft)
    stft1 = stft.replace("';', [], ';'", "")
    stft2 = stft1.replace("\\t", "")
    stft3 = stft2.replace('\\n', "")
    stft4 = stft3.replace("'", "")
    stft5 = stft4.replace("[", "")
    stft6 = stft5.replace("]", "")
    stft7 = stft6.replace(",", "")
    item['feature'] = stft7
    description = []
    try:
        for i in range(1, 100):
            q1 = str(i)
            trows1 = "[" + q1 + "]"
            xpathgiven1 = "//*[@id='productDescription']/p/text()[" + q1 + "]"
            gg = response.xpath(xpathgiven1).extract()
            description.append(gg)
            description.append(";")
        stft = str(description)
        dsft1 = stft.replace("';', [], ';'", "")
        dsft2 = dsft1.replace("'], ';', ['", ";")
        dsft3 = dsft2.replace('\\n', "")
        dsft33 = dsft3.replace('\\t', "")
        dsft4 = dsft33.replace("'", "")
        dsft5 = dsft4.replace("[", "")
        dsft6 = dsft5.replace("]", "")
        dsft7 = dsft6.replace(",", "")
        item['Description'] = dsft7
    except IndexError:
        item['Description'] = "No Description"
In the above code everything works fine, but item['proddescription'] yields an empty list. Any help with the above will be highly appreciated.
This worked for your variant:
response.xpath('//*[@id="productDetails_detailBullets_sections1"]/tr/*/text()').re(r'(\w+[^\n]+)')
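If you want label/value pairs rather than a flat list, a hedged variation on the same idea (my sketch, assuming the table keeps its th/td layout) is to walk the rows and pair each header with its cell:
rows = response.xpath('//*[@id="productDetails_detailBullets_sections1"]/tr')
spec = {}
for row in rows:
    key = row.xpath('th/text()').re_first(r'\w+[^\n]+')
    value = row.xpath('td/text()').re_first(r'\w+[^\n]+')
    if key and value:
        spec[key.strip()] = value.strip()
item['proddescription'] = spec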

Bokeh charts unresponsive on rangeslider on_change

I am working with Bokeh charts for the first time. I have followed a few tutorials, but for some reason the update function is not working on the RangeSlider's on_change().
def make_data(df, start, end):
    # df['ID'] = range(1, len(df) + 1)
    s = df['ID'] >= start
    e = df['ID'] <= end
    df1 = df[e & s]
    date = df1['date'].tolist()
    capi = df1['capi'].tolist()
    data = {'x': dateTime(date), 'y': capi}
    source = ColumnDataSource(data)
    return source

def update(attr, old, new):
    df = pd.DataFrame.from_csv("main_data.csv", index_col=None)
    df['ID'] = range(1, len(df) + 1)
    new_src = make_dataset(df, range_start=range_select.value[0], range_end=range_select.value[1])
    source.data.update(new_src.data)

def make_plot(source):
    p1 = figure(x_axis_type="datetime", title="Stock Closing Prices")
    p1.grid.grid_line_alpha = 0.3
    p1.xaxis.axis_label = 'Date'
    p1.yaxis.axis_label = 'Price'
    p1.line('x', 'y', source=source, color='#A6CEE3', legend='capi')
    return p1

range_select = RangeSlider(title="Date range", value=(ids[0], ids[100]), start=ids[0], end=ids[-1], step=1)
range_select.on_change('value', update)
source = make_data(df, 1, 1000)
p = make_plot(source)
controls = WidgetBox(range_select)
layout = column(controls, p)
tab = Panel(child=layout, title='Histogram')
tabs = Tabs(tabs=[tab])
show(tabs)
Can someone please point me in the right direction here?
