Insert dynamic data into MySQL with Python

I wrote some code that scrapes product data and inserts it into MySQL, but an error appears when it runs.
What is the main problem with my code?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import os
import sys
import codecs
from urllib.request import urlopen
import pymysql
import mysql.connector

for i in range(1):  # electronic
    my_url = "https://www.xxxxx.com/mobile_phones/?facet_is_mpg_child=0&viewType=gridView&page="
    uClient = uReq(my_url + str(i))
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "sku -gallery"})
    for container in containers:
        name = container.img["alt"]
        title_container = container.findAll("span", {"class": "brand"})
        Brand = title_container[0].text
        price = container.findAll("span", {"class": "price"})
        price_one = price[0].text.strip()
        price_old = container.findAll("span", {"class": "price -old "})
        price_two = '0'
        if len(price_old) > 0:
            price_two = price_old[0].text.strip()
        rank = container.findAll("span", {"class": "rating-aggregate"})
        ranking = 'N/A'
        if len(rank) > 0:
            ranking = rank[0].text.strip()
        conn = pymysql.connect(host="localhost", user="root", passwd="", db="prod")
        x = conn.cursor()
        #name1 = name()
        #brand1 = Brand()
        #price_one1 = price_one1()
        #price_two1 = price_one1()
        #rank1 = rank()
        x.execute("INSERT INTO list (productname,brand,price1,price2,rank) VALUES (%s,%s,%s,%s.%s)", (name, Brand, price_one, price_two, ranking))
        conn.commit()
        conn.close()
C:\Users\xxxx\AppData\Local\Programs\Python\Python35\python.exe C:/Users/xxxx/.PyCharm2018.2/config/scratches/bd.py
Traceback (most recent call last):
  File "C:/Users/xxxx/.PyCharm2018.2/config/scratches/bd.py", line 54, in <module>
    x.execute("INSERT INTO list (productname,brand,price1,price2,rank) VALUES (%s,%s,%s,%s.%s)" , (name,Brand,price_one,price_two,ranking))
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\cursors.py", line 170, in execute
    result = self._query(query)
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\cursors.py", line 328, in _query
    conn.query(q)
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\connections.py", line 516, in query
    self._affected_rows = self._read_query_result(unbuffered=unbuffered)
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\connections.py", line 727, in _read_query_result
    result.read()
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\connections.py", line 1066, in read
    first_packet = self.connection._read_packet()
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\connections.py", line 683, in _read_packet
    packet.check_error()
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\protocol.py", line 220, in check_error
    err.raise_mysql_exception(self._data)
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\err.py", line 109, in raise_mysql_exception
    raise errorclass(errno, errval)
pymysql.err.ProgrammingError: (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MariaDB server version for the right syntax to use near '.'2')' at line 1")
Process finished with exit code 1

Two things are wrong here. The syntax error MariaDB reports (1064, near '.'2')') comes from a typo in the VALUES list: %s.%s uses a dot where the last separator must be a comma. Also make sure you pass ranking, not rank.
By the code you have given, rank is still a ResultSet:
rank = container.findAll("span", {"class": "rating-aggregate"})  # ResultSet
if len(rank) > 0:
    ranking = rank[0].text.strip()  # result
So the change is
x.execute("INSERT INTO list (productname,brand,price1,price2,rank) VALUES (%s,%s,%s,%s,%s)", (name, Brand, price_one, price_two, ranking))
and you are ready to go! I have a suggestion for you as well. If a variable is first assigned inside an if, always give an else branch or a default value; otherwise you will end up with a NameError when the condition fails. Like,
rank = container.findAll("span", {"class": "rating-aggregate"})
ranking = rank[0].text.strip() if len(rank) > 0 else 'N/A'
Or,
rank = container.findAll("span", {"class": "rating-aggregate"})
ranking = 'N/A'
if len(rank) > 0:
    ranking = rank[0].text.strip()
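For completeness, here is a minimal sketch of the database side with the connection opened once instead of once per row. It is only a sketch under the question's assumptions (the same prod database and list table); `rank` is backquoted in case your server version treats it as a reserved word:

import pymysql

# rows: list of (name, brand, price1, price2, ranking) tuples scraped as above.
def save_rows(rows):
    conn = pymysql.connect(host="localhost", user="root", passwd="", db="prod")
    try:
        with conn.cursor() as cur:
            cur.executemany(
                "INSERT INTO list (productname, brand, price1, price2, `rank`) "
                "VALUES (%s, %s, %s, %s, %s)",
                rows,
            )
        conn.commit()  # one commit for the whole batch
    finally:
        conn.close()

save_rows([("Phone X", "BrandY", "99", "0", "N/A")])  # example row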
Cheers!

This code stores the information in a CSV file, but now I need to save it to MySQL.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import os
import sys
import unicodecsv as csv
import codecs
from urllib.request import urlopen

filename = "mobile.csv"
f = codecs.open(filename, "a", "utf-8-sig")
headers = "name, Brand, price_one, price_two, ranking\n"
f.write(headers)  # write the header once, not once per page
for i in range(3):  # electronic
    my_url = "https://www.xxxx.com/mobile_phones/?facet_is_mpg_child=0&viewType=gridView&page="
    uClient = uReq(my_url + str(i))
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "sku -gallery"})
    for container in containers:
        name = container.img["alt"]
        title_container = container.findAll("span", {"class": "brand"})
        Brand = title_container[0].text
        price = container.findAll("span", {"class": "price"})
        price_one = price[0].text.strip()
        price_old = container.findAll("span", {"class": "price -old "})
        price_two = '0'  # keep it a string so .replace() below always works
        if len(price_old) > 0:
            price_two = price_old[0].text.strip()
        rank = container.findAll("span", {"class": "rating-aggregate"})
        ranking = 'N/A'  # default so ranking is always defined
        if len(rank) > 0:
            ranking = rank[0].text.strip()
        print("name " + name)
        print("Brand " + Brand)
        print("price_one " + price_one)
        print("price_two {}".format(price_two))
        print("ranking " + ranking)
        f.write(name + "," + Brand.replace(",", "|") + "," + price_one.replace(",", "") + "," + price_two.replace(",", "") + "," + ranking + "\n")
f.close()

Related

Python Program ValueError: invalid literal for int() with base 10:

I am a Python beginner, trying to write a program: "Corona Virus Live Updates for India – Using Python".
I am getting this error after running the code:
performance.append(int(row[2]) + int(row[3]))
ValueError: invalid literal for int() with base 10:
What can I do to fix this problem?
The Code:
extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
URL = 'https://www.mohfw.gov.in/'
SHORT_HEADERS = ['SNo', 'State', 'Indian-Confirmed',
                 'Foreign-Confirmed', 'Cured', 'Death']
response = requests.get(URL).content
soup = BeautifulSoup(response, 'html.parser')
header = extract_contents(soup.tr.find_all('th'))
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td'))
    if stat:
        if len(stat) == 5:
            # last row
            stat = ['', *stat]
            stats.append(stat)
        elif len(stat) == 6:
            stats.append(stat)
stats[-1][1] = "Total Cases"
stats.remove(stats[-1])

#Step #3:
objects = []
for row in stats:
    objects.append(row[1])
y_pos = np.arange(len(objects))
performance = []
for row in stats:
    performance.append(int(row[2]) + int(row[3]))
table = tabulate(stats, headers=SHORT_HEADERS)
print(table)
I just changed the line performance.append(int(row[2]) + int(row[3])) to performance.append(row[2] + str(int(float(row[3])))) and the error went away.
Full Code:
import requests
from bs4 import BeautifulSoup
import numpy as np
from tabulate import tabulate

extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
URL = 'https://www.mohfw.gov.in/'
SHORT_HEADERS = ['SNo', 'State', 'Indian-Confirmed', 'Foreign-Confirmed', 'Cured', 'Death']
response = requests.get(URL).content
soup = BeautifulSoup(response, 'html.parser')
header = extract_contents(soup.tr.find_all('th'))
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td'))
    if stat:
        if len(stat) == 5:
            # last row
            stat = ['', *stat]
            stats.append(stat)
        elif len(stat) == 6:
            stats.append(stat)
stats[-1][1] = "Total Cases"
stats.remove(stats[-1])

#Step #3:
objects = [row[1] for row in stats]
y_pos = np.arange(len(objects))
performance = [row[2] + str(int(float(row[3]))) for row in stats]
table = tabulate(stats, headers=SHORT_HEADERS)
print(table)
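A more defensive variant (just a sketch, not part of the original answer) keeps the sum-of-ints meaning of the original line and coerces each scraped cell with a small helper, so empty or float-formatted cells do not raise:

def to_int(cell):
    # Parse a scraped table cell as an int; treat blanks and junk as 0.
    cell = cell.strip().replace(',', '')
    try:
        return int(float(cell))  # accepts '123' as well as '123.0'
    except ValueError:
        return 0  # empty string or non-numeric text

performance = [to_int(row[2]) + to_int(row[3]) for row in stats]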

how to access bestbuy item price

I want to check the price of an item on the Best Buy website; however, access is denied. Does anyone have advice on how to get access? Thanks!
My code:
import requests
import bs4 as bs

url = "https://www.bestbuy.com/site/lg-65-class-oled-b9-series-2160p-smart-4k-uhd-tv-with-hdr/6360611.p?skuId=6360611"
url_get = requests.get(url)
soup = bs.BeautifulSoup(url_get.content, 'lxml')
with open('url_bestbuy.txt', 'w', encoding='utf-8') as f_out:
    f_out.write(soup.prettify())

js_test = soup.find('span', id='priceblock_ourprice')
if js_test is None:
    js_test = soup.find('span', id='div.price-block')

str = ""
for line in js_test.stripped_strings:
    str = line

# convert to integer
str = str.replace(", ", "")
str = str.replace("$", "")
current_price = int(float(str))

your_price = 2000
if current_price < your_price:
    print("I can afford it")
else:
    print("Price is high please wait for the best deal")
You don't have permission to access "http://www.bestbuy.com/site/lg-65-class-oled-b9-series-2160p-smart-4k-uhd-tv-with-hdr/6360611.p?" on this server.
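Sites like Best Buy typically reject the default python-requests User-Agent, which is exactly what produces that "You don't have permission" page. A hedged first step (the header values are illustrative, and the site may still block automated clients) is to send browser-like headers:

import requests

url = "https://www.bestbuy.com/site/lg-65-class-oled-b9-series-2160p-smart-4k-uhd-tv-with-hdr/6360611.p?skuId=6360611"

# Browser-like identity; without it the request gets the permission error above.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

url_get = requests.get(url, headers=headers)
print(url_get.status_code)  # expect 200 when the request is accepted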

How to scrape data that cannot be inspected and is inside the <svg> tag

I'm unable to scrape some of the data from the webpage Partywise Result. I want to scrape the party-wise {vote%, vote count} from that page.
The code I have tried so far:
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

#chhattisgarh
edatas = ""
edata1 = ""
codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    for records2 in soup2.findAll("div", {"id": "piecharts26"}):
        print(records2.table)
        for records in records2.findAll("table"):
            print(records)
            edata = ""
            for data in records.findAll('td'):
                edata = edata + "," + data.text
            edatas = edatas + "\n" + edata[1:] + "," + code

header = "Party,Won,Leading,Total,State code"
file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2018
#file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2014
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(edatas, encoding="ascii", errors="ignore"))
file.write(bytes(edata1, encoding="ascii", errors="ignore"))
The result I am expecting is the party-wise % vote share. I want the output to be in CSV format like this:
INC,43.0%,6144192
and so on, for the full list from pages one and two.
The data is loaded by JavaScript directly inside your div:
if(document.getElementById('piecharts26')!=null)
So you have to use a browser-automation tool such as Selenium, or extract the data with a regex:
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import json

def get_data(html_page):
    s = str(html_page)
    r = re.compile('data.addRows\((.*?)\);')
    m = r.search(s)
    if m:
        result = m.group(1)
        return json.loads(result.replace("'", '"'))

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

#chhattisgarh
edatas = ""
edata1 = ""
codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    result = get_data(soup2)
    print(result)

header = "Party,Won,Leading,Total,State code"
file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2018
#file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2014
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(edatas, encoding="ascii", errors="ignore"))
file.write(bytes(edata1, encoding="ascii", errors="ignore"))
OUTPUT:
[['INC {43.0%,6144192}', 6144192],
['BJP {33.0%,4707141}', 4707141],
['JCCJ {7.6%,1086581}', 1086581],
['IND {5.9%,839053}', 839053],
['BSP {3.9%,552313}', 552313],
['GGP {1.7%,247459}', 247459],
['AAAP {0.9%,123526}', 123526],
['CPI {0.3%,48255}', 48255],
['APoI {0.3%,42013}', 42013],
['SHS {0.2%,34678}', 34678],
['NCP {0.2%,28983}', 28983],
['SP {0.2%,21969}', 21969],
['BYPP {0.1%,8425}', 8425],
['CPM {0.1%,8348}', 8348],
['JD(U) {0.1%,8285}', 8285],
['CSM {0.1%,7783}', 7783],
['BMUP {0.1%,7419}', 7419],
['BSCP {0.0%,5546}', 5546],
['BTP {0.0%,5498}', 5498],
['RJsbhP {0.0%,5141}', 5141],
['RGOP {0.0%,5040}', 5040],
['IPBP {0.0%,4982}', 4982],
['NINSHAD {0.0%,4586}', 4586],
['PSPU {0.0%,4309}', 4309],
['BHBHP {0.0%,3780}', 3780],
['RPI(A) {0.0%,3257}', 3257],
['JAC {0.0%,3034}', 3034],
['CPIM {0.0%,3017}', 3017],
['NDPF {0.0%,2912}', 2912],
['AASPP {0.0%,2474}', 2474],
['BBC {0.0%,2089}', 2089],
['SWAP {0.0%,2023}', 2023],
['cvgrp {0.0%,1582}', 1582],
['bhmm {0.0%,1474}', 1474],
['AVVP {0.0%,1407}', 1407],
['LSWP {0.0%,1399}', 1399],
['CSP {0.0%,1232}', 1232],
['BPSGKD {0.0%,1093}', 1093],
['BKNP {0.0%,1085}', 1085],
['CGVP {0.0%,1053}', 1053],
['SUCI {0.0%,1048}', 1048],
['SUSP {0.0%,988}', 988],
['DPI {0.0%,970}', 970],
['RJBP {0.0%,717}', 717],
['ASSP {0.0%,701}', 701],
['BLRP {0.0%,570}', 570],
['BSHSP {0.0%,562}', 562],
['ABHM {0.0%,549}', 549],
['SSBD {0.0%,468}', 468],
['ABSSP {0.0%,436}', 436],
['BRSP {0.0%,429}', 429],
['ABSKP {0.0%,389}', 389],
['BSSP {0.0%,279}', 279],
['BNIP {0.0%,267}', 267],
['RMGP {0.0%,258}', 258],
['KMSP {0.0%,241}', 241],
['BHBP {0.0%,224}', 224],
['RP(K) {0.0%,202}', 202],
['CMM {0.0%,192}', 192],
['CHSJP {0.0%,183}', 183],
['RSSM {0.0%,72}', 72],
['AnAP {0.0%,66}', 66],
['NOTA {2.0%,282744}', 282744]]
Then you can loop over the result and save it to the CSV file.
EDIT: to save it to a CSV file:
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import json
import csv

def get_data(html_page):
    s = str(html_page)
    r = re.compile('data.addRows\((.*?)\);')
    m = r.search(s)
    if m:
        result = m.group(1)
        return json.loads(result.replace("'", '"'))

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    result = get_data(soup2)
    header = ["Party", "Vote%", "Count", "State code"]
    results_export = []
    results_export.append(header)
    for r in result:
        export = []
        party = r[0].split(' {')[0]
        percent = r[0].split(' {')[1].split(',')[0]
        count = r[1]
        export.append(str(party))
        export.append(str(percent))
        export.append(str(count))
        export.append(code)
        results_export.append(export)
    with open(os.path.expanduser("per2014_result.csv"), "w", newline="") as file:  # 2018; with-block closes the file
        writer = csv.writer(file)
        writer.writerows(results_export)
EDIT 2: if the page contains several data.addRows(...) calls, this variant takes the last non-empty match:
def get_data(html_page):
    s = str(html_page)
    r = re.compile('data.addRows\((.*?)\);')
    ms = r.findall(s)
    result = '[]'
    if ms:
        for m in ms:
            if m != '[]':
                result = m
    return json.loads(result.replace("'", '"'))

python error: request isn't defined

I am trying to learn how to automatically fetch URLs from a page. In the following code I am trying to get the port coordinates from different links on the webpage:
import urllib.request
import re

a = input("What country is your port in?: ")
b = input("What is the name of the port?: ")
url = "http://ports.com/"
totalurl = "http://ports.com/" + a + "/" + b + "/"
htmlfile = request.urlopen(url)
htmltext = htmlfile.read()
regex = '<span class="small' + a + "/" + b + "/" '">...</span>'
pattern = re.compile(regex)
with urllib.request.urlopen(url) as response:
    html = response.read().decode()
num = re.findall(pattern, html)
print(num)
This is the error message I receive:
What country is your port in?: greece
What is the name of the port?: port-of-eleusis
Traceback (most recent call last):
  File "/Users/kieronblemings/Desktop/PROGRAMS PYTHON/ports extraction.py", line 13, in <module>
    htmlfile = request.urlopen(url)
NameError: name 'request' is not defined
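The NameError means the code imports urllib.request but then calls the bare name request, which is never bound. Either use the full module path or import the name that is actually called:

import urllib.request

url = "http://ports.com/"
htmlfile = urllib.request.urlopen(url)  # full module path

# or, equivalently:
from urllib.request import urlopen
htmlfile = urlopen(url)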

Python 3.x unsupported operand type in using encode decode

I am trying to build a generic crawler for my marketing project and keep track of where the information came from, viz. blogs, testimonials, etc. I am using Python 3.5 with Spyder/PyCharm as the IDE, and I keep getting the following error when using encode/decode. The input to my code is a list of company names and product features in an Excel file. I also searched for possible solutions, but the recommendations in the community are for typecasting, which I am not sure is the problem.
Kindly let me know if more clarification is required from my side.
from __future__ import division, unicode_literals
import codecs
import re
import os
import xlrd
import requests
import urllib.request  # needed for urllib.request.Request below
from urllib.request import urlopen
from time import sleep
from bs4 import BeautifulSoup
import openpyxl
from collections import Counter

page = 0
b = 0
n = 0
w = 0
p = 0
o = 0
workbook = xlrd.open_workbook("C:\Product.xlsx")
workbook1 = xlrd.open_workbook("C:\linkslist.xlsx")
sheet_names = workbook.sheet_names()
sheet_names1 = workbook1.sheet_names()
wb = openpyxl.Workbook()  # User Spreadsheet
ws = wb.active
ws.title = "User"
ws['A1'] = 'Feature'
ws['B1'] = 'Customer-Testimonials'
ws['C1'] = 'Case Study'
ws['D1'] = 'Blog'
ws['E1'] = 'Press'
ws['F1'] = 'Total posts'
ws1 = wb.create_sheet(title="Ml")
ws1['A1'] = 'Feature'
ws1['B1'] = 'Phrase'
ws1['C1'] = 'Address'
ws1['D1'] = 'Tag Count'
worksheet = workbook.sheet_by_name(sheet_names[0])
worksheet1 = workbook1.sheet_by_name(sheet_names[0])
for linknumber in range(0, 25):
    u = worksheet1.cell(linknumber, 0).value
    url = 'www.' + u.lower() + '.com'
    print(url)
    r = ''
    while r == '':
        try:
            print("in loop")
            r = requests.get("http://" + url)
        except:
            sleep(3)  # if the code still gives that error then try increasing the sleep time to 5 maybe
    print(r)
    data = r.text
    #print data
    soup1 = BeautifulSoup(data, "html.parser")
    #print soup1
    num = 3  # starting row number and keep the column same.
    word = ''
    word = worksheet.cell(num, 3).value
    while not word == 'end':
        print(num)
        #print word
        tag_list = []
        phrase = []
        counts = []
        address = []
        counts = Counter(tag_list)
        for link in soup1.find_all('a'):
            #print link
            add = link.encode("ascii", "ignore")
            print(add)
            if not 'Log In' in add:
                #print link.get('href')
                i = 0
                content = ''
                for i in range(1, 5):
                    if content == '':
                        try:
                            print(link.get('href'))
                            i += 1
                            req = urllib.request.Request(link.get('href'))
                            with urllib.request.urlopen(req) as response:
                                content = response.read()
                        except:
                            sleep(3)
                            # if the code still gives that error then try increasing the sleep time to 5 maybe
                            continue
                soup = BeautifulSoup(content, "html.parser")
                s = soup(text=re.compile(word))
                if s:
                    print("TRUE")
                    add = link.encode('ascii', 'ignore')
                    print(type(add))
                    if 'customer-testimonial' in add:
                        b += 1
                    elif 'case-study' in add:
                        n += 1
                    elif 'blog' in add:
                        w += 1
                    elif 'press' in add:
                        p += 1
                    else:
                        o += 1
                    #phrase_type=["Customer testimonials","news","ads","twitter","facebook","instagram"]
                    #print(os.path.join(root, name))
                    print(add)
                    for tag in s:
                        parent_html = tag.parent.name
                        print(parent_html)
                        tag_list.append(parent_html)
                    phrase.append(s)
                    address.append(add)
                    #print str(phrase)
                    counts = Counter(tag_list)
                    page += 1
        else:
            counts = Counter(tag_list)
            no = num - 1
            print(counts)
            print(word)
            ws['A%d' % no] = word.encode('utf-8', 'ignore')
            ws1['A%d' % no] = word.encode('utf-8', 'ignore')
            print("Number of pages is %d" % page)
            print("Number of Customer testimonials posts is %d" % b)
            ws['B%d' % no] = b
            print("Number of Case Studies posts is %d" % n)
            ws['C%d' % no] = n
            print("Number of blog posts is %d" % w)
            ws['D%d' % no] = w
            print("Number of press posts is %d" % p)
            ws['E%d' % no] = p
            print("Number of posts is %d" % page)
            ws['F%d' % no] = page
            ws1['B%d' % no] = phrase.encode('utf-8', 'ignore')
            ws1['C%d' % no] = address.encode('utf-8', 'ignore')
            ws1['D%d' % no] = counts.encode('utf-8', 'ignore')
            counts.clear()
            num += 1
            word = worksheet.cell(num, 3).value
            #print word
        page = 0
        b = 0
        n = 0
        w = 0
        p = 0
        o = 0
        phrase = []
        address = []
        tag_list = []
    wb.save('%s.xlsx' % worksheet1.cell(linknumber, 0).value)
I get the following output and error while running the code:
www.amobee.com
in loop
<Response [200]>
3
Traceback (most recent call last):
File "C:/project_web_parser.py", line 69, in <module>
add = link.encode("ascii", "ignore")
File "C:\ProgramData\Ana3\lib\site-packages\bs4\element.py", line 1094, in encode
u = self.decode(indent_level, encoding, formatter)
File "C:\ProgramData\Ana3\lib\site-packages\bs4\element.py", line 1159, in decode
indent_space = (' ' * (indent_level - 1))
TypeError: unsupported operand type(s) for -: 'str' and 'int'
Process finished with exit code 1
The traceback shows the error at line 69, where you try to encode link. To fix it, just change that line to:
add = link.encode("ascii", errors="ignore")
Why does it happen?
Your link variable is of type bs4.element.Tag:
>>> type(link)
<class 'bs4.element.Tag'>
The .encode() method for tags takes more arguments than the .encode() method for strings. In the bs4 source, in file \bs4\element.py on line 1089, you can find its definition:
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
The first argument is encoding, the second is indent_level (int or None), and error handling is the fourth. The error
unsupported operand type(s) for -: 'str' and 'int'
means that 'ignore' was passed positionally as indent_level, so the code tried to compute 'ignore' - 1.
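In short, a minimal illustration (assuming link is any bs4.element.Tag):

from bs4 import BeautifulSoup

link = BeautifulSoup('<a href="/blog">Blog</a>', 'html.parser').a

# Positional: "ignore" lands in indent_level and later hits ('ignore' - 1).
# link.encode("ascii", "ignore")  # raises the TypeError above

# Keyword: "ignore" reaches the errors parameter as intended.
print(link.encode("ascii", errors="ignore"))  # b'<a href="/blog">Blog</a>'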
