Insert dynamic data into MySQL with Python - python-3.x
EDITED >>>>>
I wrote some code that returns two outputs, but an error appears.
What is the main problem with my code?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import os
import sys
import codecs
from urllib.request import urlopen
import pymysql
import mysql.connector
for i in range(1):  # electronic
    my_url = "https://www.xxxxx.com/mobile_phones/?facet_is_mpg_child=0&viewType=gridView&page="
    uClient = uReq(my_url + str(i))
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "sku -gallery"})
    for container in containers:
        name = container.img["alt"]
        title_container = container.findAll("span", {"class": "brand"})
        Brand = title_container[0].text
        price = container.findAll("span", {"class": "price"})
        price_one = price[0].text.strip()
        price_old = container.findAll("span", {"class": "price -old "})
        price_two = '0'
        if len(price_old) > 0:
            price_two = price_old[0].text.strip()
        rank = container.findAll("span", {"class": "rating-aggregate"})
        ranking = 'N/A'
        if len(rank) > 0:
            ranking = rank[0].text.strip()
        conn = pymysql.connect(host="localhost", user="root", passwd="", db="prod")
        x = conn.cursor()
        # name1 = name()
        # brand1 = Brand()
        # price_one1 = price_one1()
        # price_two1 = price_one1()
        # rank1 = rank()
        x.execute("INSERT INTO list (productname,brand,price1,price2,rank) VALUES (%s,%s,%s,%s.%s)", (name, Brand, price_one, price_two, ranking))
        conn.commit()
        conn.close()
C:\Users\xxxx\AppData\Local\Programs\Python\Python35\python.exe C:/Users/xxxx/.PyCharm2018.2/config/scratches/bd.py
Traceback (most recent call last):
  File "C:/Users/xxxx/.PyCharm2018.2/config/scratches/bd.py", line 54, in <module>
    x.execute("INSERT INTO list (productname,brand,price1,price2,rank) VALUES (%s,%s,%s,%s.%s)", (name, Brand, price_one, price_two, ranking))
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\cursors.py", line 170, in execute
    result = self._query(query)
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\cursors.py", line 328, in _query
    conn.query(q)
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\connections.py", line 516, in query
    self._affected_rows = self._read_query_result(unbuffered=unbuffered)
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\connections.py", line 727, in _read_query_result
    result.read()
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\connections.py", line 1066, in read
    first_packet = self.connection._read_packet()
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\connections.py", line 683, in _read_packet
    packet.check_error()
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\protocol.py", line 220, in check_error
    err.raise_mysql_exception(self._data)
  File "C:\Users\xxxx\AppData\Local\Programs\Python\Python35\lib\site-packages\pymysql\err.py", line 109, in raise_mysql_exception
    raise errorclass(errno, errval)
pymysql.err.ProgrammingError: (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MariaDB server version for the right syntax to use near '.'2')' at line 1")

Process finished with exit code 1
The problem is a typo in your VALUES clause: between the last two placeholders you wrote a dot instead of a comma (%s.%s instead of %s,%s). That is exactly what MariaDB is complaining about in the message, near '.'2')' - the stray dot followed by your price_two value. For the record, rank is the raw result set and ranking is the extracted text, so ranking is the right value to pass:

rank = container.findAll("span", {"class": "rating-aggregate"})  # result set
if len(rank) > 0:
    ranking = rank[0].text.strip()  # result

So the change is

x.execute("INSERT INTO list (productname,brand,price1,price2,rank) VALUES (%s,%s,%s,%s,%s)", (name, Brand, price_one, price_two, ranking))
and you are ready to go! I have a suggestion for you as well: when you assign a variable inside an if block, always give it an else branch or a default value. Otherwise you may end up with a NameError when the condition fails. Like,
rank = container.findAll("span",{"class" : "rating-aggregate"})
ranking = rank[0].text.strip() if len(rank) > 0 else 'N/A'
Or,
rank = container.findAll("span",{"class" : "rating-aggregate"})
ranking = 'N/A'
if len(rank) > 0:
    ranking = rank[0].text.strip()
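One more side note: you open a new MySQL connection for every product inside the inner loop. Connecting once and committing at the end is faster and keeps the table consistent if a page fails halfway through. Here is a rough sketch of that shape, reusing the prod database and list table from your code (the rows list is just a stand-in for whatever your scraping loop collects):

import pymysql

# collect one (name, Brand, price_one, price_two, ranking) tuple per container
rows = []
# ... the scraping loop appends to rows here ...

conn = pymysql.connect(host="localhost", user="root", passwd="", db="prod")
x = conn.cursor()
# executemany repeats the parameterized INSERT once per tuple
x.executemany(
    "INSERT INTO list (productname,brand,price1,price2,rank) VALUES (%s,%s,%s,%s,%s)",
    rows)
conn.commit()
conn.close()

(If you ever move from MariaDB to MySQL 8.0, note that RANK became a reserved word there, so the column would need backticks in the statement.)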
Cheers!
This code stores the information in a CSV file, but now I need to save it to MySQL.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import os
import sys
import unicodecsv as csv
import codecs
from urllib.request import urlopen
for i in range(3):  # electronic
    my_url = "https://www.xxxx.com/mobile_phones/?facet_is_mpg_child=0&viewType=gridView&page="
    uClient = uReq(my_url + str(i))
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "sku -gallery"})
    filename = "mobile.csv"
    f = codecs.open(filename, "a", "utf-8-sig")
    headers = "name, Brand, price_one, price_two, ranking\n"
    f.write(headers)
    for container in containers:
        name = container.img["alt"]
        title_container = container.findAll("span", {"class": "brand"})
        Brand = title_container[0].text
        price = container.findAll("span", {"class": "price"})
        price_one = price[0].text.strip()
        price_old = container.findAll("span", {"class": "price -old "})
        price_two = 0
        if len(price_old) > 0:
            price_two = price_old[0].text.strip()
        rank = container.findAll("span", {"class": "rating-aggregate"})
        if len(rank) > 0:
            ranking = rank[0].text.strip()
        print("name " + name)
        print("Brand " + Brand)
        print("price_one " + price_one)
        print("price_two {}".format(price_two))  # ---->
        print("ranking " + ranking)
        f.write(name + "," + Brand.replace(",", "|") + "," + price_one.replace(",", "") + "," + price_two.replace(",", "") + "," + ranking + "\n")
    f.close()
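A side note on the CSV part: building each line by hand with + and .replace() breaks as soon as a field contains a comma or a quote. The csv module quotes fields for you; a small sketch using the standard-library writer with the same variables as the loop above:

import csv

# the writer quotes fields itself, so no manual replace() is needed
with open("mobile.csv", "a", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerow([name, Brand, price_one, price_two, ranking])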
Related
Python Program ValueError: invalid literal for int() with base 10:
I am a Python beginner, trying to write a program: "Corona Virus Live Updates for India – Using Python". I am getting this error after running the code:

performance.append(int(row[2]) + int(row[3]))
ValueError: invalid literal for int() with base 10:

What can I do to fix this problem? The Code:

extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
URL = 'https://www.mohfw.gov.in/'
SHORT_HEADERS = ['SNo', 'State', 'Indian-Confirmed', 'Foreign-Confirmed', 'Cured', 'Death']
response = requests.get(URL).content
soup = BeautifulSoup(response, 'html.parser')
header = extract_contents(soup.tr.find_all('th'))
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td'))
    if stat:
        if len(stat) == 5:  # last row
            stat = ['', *stat]
            stats.append(stat)
        elif len(stat) == 6:
            stats.append(stat)
stats[-1][1] = "Total Cases"
stats.remove(stats[-1])
# Step #3:
objects = []
for row in stats:
    objects.append(row[1])
y_pos = np.arange(len(objects))
performance = []
for row in stats:
    performance.append(int(row[2]) + int(row[3]))
table = tabulate(stats, headers=SHORT_HEADERS)
print(table)
Just change the line performance.append(int(row[2]) + int(row[3])) to performance.append(row[2] + str(int(float(row[3])))). Full Code:

import requests
from bs4 import BeautifulSoup
import numpy as np
from tabulate import tabulate

extract_contents = lambda row: [x.text.replace('\n', '') for x in row]
URL = 'https://www.mohfw.gov.in/'
SHORT_HEADERS = ['SNo', 'State', 'Indian-Confirmed', 'Foreign-Confirmed', 'Cured', 'Death']
response = requests.get(URL).content
soup = BeautifulSoup(response, 'html.parser')
header = extract_contents(soup.tr.find_all('th'))
stats = []
all_rows = soup.find_all('tr')
for row in all_rows:
    stat = extract_contents(row.find_all('td'))
    if stat:
        if len(stat) == 5:  # last row
            stat = ['', *stat]
            stats.append(stat)
        elif len(stat) == 6:
            stats.append(stat)
stats[-1][1] = "Total Cases"
stats.remove(stats[-1])
# Step #3:
objects = [row[1] for row in stats]
y_pos = np.arange(len(objects))
performance = [row[2] + str(int(float(row[3]))) for row in stats]
table = tabulate(stats, headers=SHORT_HEADERS)
print(table)
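Note that this change concatenates the two cells as strings rather than adding them as numbers. If a numeric sum is what you actually want, a more defensive sketch (a suggestion, not part of the original answer) is a tiny helper that tolerates blank or decimal cells:

def to_int(cell):
    # scraped cells may be '', '12' or '12.0'; fall back to 0 instead of raising
    try:
        return int(float(cell))
    except (TypeError, ValueError):
        return 0

performance = [to_int(row[2]) + to_int(row[3]) for row in stats]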
How to access a Best Buy item price
I want to check the price of an item from the Best Buy website; however, access is denied. Does anyone have some advice on how to access it? Thanks! My code:

import requests
import bs4 as bs

url = "https://www.bestbuy.com/site/lg-65-class-oled-b9-series-2160p-smart-4k-uhd-tv-with-hdr/6360611.p?skuId=6360611"
url_get = requests.get(url)
soup = bs.BeautifulSoup(url_get.content, 'lxml')
with open('url_bestbuy.txt', 'w', encoding='utf-8') as f_out:
    f_out.write(soup.prettify())

js_test = soup.find('span', id='priceblock_ourprice')
if js_test is None:
    js_test = soup.find('span', id='div.price-block')

str = ""
for line in js_test.stripped_strings:
    str = line

# convert to integer
str = str.replace(", ", "")
str = str.replace("$", "")
current_price = int(float(str))

your_price = 2000
if current_price < your_price:
    print("I can afford it")
else:
    print("Price is high please wait for the best deal")

The response I get is:

You don't have permission to access "http://www.bestbuy.com/site/lg-65-class-oled-b9-series-2160p-smart-4k-uhd-tv-with-hdr/6360611.p?" on this server.
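The "You don't have permission" page usually means the server rejected the request as automated, most often because of the default python-requests User-Agent. A common first step - a sketch only, since Best Buy's bot protection may block automated clients regardless - is to send browser-like headers:

import requests

url = "https://www.bestbuy.com/site/lg-65-class-oled-b9-series-2160p-smart-4k-uhd-tv-with-hdr/6360611.p?skuId=6360611"
# present as a regular browser instead of python-requests/x.y
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept-Language": "en-US,en;q=0.9",
}
resp = requests.get(url, headers=headers, timeout=10)
print(resp.status_code)  # 200 would mean the block was only on the User-Agent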
How to scrape data which can't be inspected and which is inside the <svg> tag
I'm unable to scrape some of the data from the webpage Partywise Result. I want to scrape the party-wise {vote%, vote count} from that page. The code I have tried so far:

import urllib
import urllib.request
from bs4 import BeautifulSoup
import os

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

# chhattisgarh
edatas = ""
edata1 = ""
codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    for records2 in soup2.findAll("div", {"id": "piecharts26"}):
        print(records2.table)
        for records in records2.findAll("table"):
            print(records)
            edata = ""
            for data in records.findAll('td'):
                edata = edata + "," + data.text
            edatas = edatas + "\n" + edata[1:] + "," + code

header = "Party,Won,Leading,Total,State code"
file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2018
#file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2014
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(edatas, encoding="ascii", errors="ignore"))
file.write(bytes(edata1, encoding="ascii", errors="ignore"))

The result which I am expecting is the % vote share. I want the output to be in CSV format like this:

INC,43.0%,6144192

and so on, fully from pages one and two.
The data is loaded directly by the JavaScript inside your div:

if(document.getElementById('piecharts26')!=null)

So you either have to use a browser that executes JavaScript, such as selenium (link here), or extract the data with a regex:

import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import json

def get_data(html_page):
    s = str(html_page)
    r = re.compile('data.addRows\((.*?)\);')
    m = r.search(s)
    if m:
        result = m.group(1)
    return json.loads(result.replace("'", '"'))

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

# chhattisgarh
edatas = ""
edata1 = ""
codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    result = get_data(soup2)
    print(result)

header = "Party,Won,Leading,Total,State code"
file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2018
#file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2014
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(edatas, encoding="ascii", errors="ignore"))
file.write(bytes(edata1, encoding="ascii", errors="ignore"))

OUTPUT:

[['INC {43.0%,6144192}', 6144192], ['BJP {33.0%,4707141}', 4707141], ['JCCJ {7.6%,1086581}', 1086581], ['IND {5.9%,839053}', 839053], ['BSP {3.9%,552313}', 552313], ['GGP {1.7%,247459}', 247459], ['AAAP {0.9%,123526}', 123526], ['CPI {0.3%,48255}', 48255], ['APoI {0.3%,42013}', 42013], ['SHS {0.2%,34678}', 34678], ['NCP {0.2%,28983}', 28983], ['SP {0.2%,21969}', 21969], ['BYPP {0.1%,8425}', 8425], ['CPM {0.1%,8348}', 8348], ['JD(U) {0.1%,8285}', 8285], ['CSM {0.1%,7783}', 7783], ['BMUP {0.1%,7419}', 7419], ['BSCP {0.0%,5546}', 5546], ['BTP {0.0%,5498}', 5498], ['RJsbhP {0.0%,5141}', 5141], ['RGOP {0.0%,5040}', 5040], ['IPBP {0.0%,4982}', 4982], ['NINSHAD {0.0%,4586}', 4586], ['PSPU {0.0%,4309}', 4309], ['BHBHP {0.0%,3780}', 3780], ['RPI(A) {0.0%,3257}', 3257], ['JAC {0.0%,3034}', 3034], ['CPIM {0.0%,3017}', 3017], ['NDPF {0.0%,2912}', 2912], ['AASPP {0.0%,2474}', 2474], ['BBC {0.0%,2089}', 2089], ['SWAP {0.0%,2023}', 2023], ['cvgrp {0.0%,1582}', 1582], ['bhmm {0.0%,1474}', 1474], ['AVVP {0.0%,1407}', 1407], ['LSWP {0.0%,1399}', 1399], ['CSP {0.0%,1232}', 1232], ['BPSGKD {0.0%,1093}', 1093], ['BKNP {0.0%,1085}', 1085], ['CGVP {0.0%,1053}', 1053], ['SUCI {0.0%,1048}', 1048], ['SUSP {0.0%,988}', 988], ['DPI {0.0%,970}', 970], ['RJBP {0.0%,717}', 717], ['ASSP {0.0%,701}', 701], ['BLRP {0.0%,570}', 570], ['BSHSP {0.0%,562}', 562], ['ABHM {0.0%,549}', 549], ['SSBD {0.0%,468}', 468], ['ABSSP {0.0%,436}', 436], ['BRSP {0.0%,429}', 429], ['ABSKP {0.0%,389}', 389], ['BSSP {0.0%,279}', 279], ['BNIP {0.0%,267}', 267], ['RMGP {0.0%,258}', 258], ['KMSP {0.0%,241}', 241], ['BHBP {0.0%,224}', 224], ['RP(K) {0.0%,202}', 202], ['CMM {0.0%,192}', 192], ['CHSJP {0.0%,183}', 183], ['RSSM {0.0%,72}', 72], ['AnAP {0.0%,66}', 66], ['NOTA {2.0%,282744}', 282744]]

Then you can loop over the result and save it into the CSV file.

EDIT: See this edit to save it into a CSV file:

import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import json
import csv

def get_data(html_page):
    s = str(html_page)
    r = re.compile('data.addRows\((.*?)\);')
    m = r.search(s)
    if m:
        result = m.group(1)
    return json.loads(result.replace("'", '"'))

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    result = get_data(soup2)
    header = ["Party", "Vote%", "Count", "State code"]
    results_export = []
    results_export.append(header)
    for r in result:
        export = []
        party = r[0].split(' {')[0]
        percent = r[0].split(' {')[1].split(',')[0]
        count = r[1]
        export.append(str(party))
        export.append(str(percent))
        export.append(str(count))
        export.append(code)
        results_export.append(export)
    file = open(os.path.expanduser("per2014_result.csv"), "w")  # 2018
    writer = csv.writer(file)
    writer.writerows(results_export)

EDIT2:

def get_data(html_page):
    s = str(html_page)
    r = re.compile('data.addRows\((.*?)\);')
    ms = r.findall(s)
    result = '[]'
    if ms:
        for m in ms:
            if m != '[]':
                result = m
    return json.loads(result.replace("'", '"'))
python error: request isn't defined
I am trying to learn how to automatically fetch URLs from a page. In the following code I am trying to get the port coordinates of the webpage from different links:

import urllib.request
import re

a = input("What country is your port in?: ")
b = input("What is the name of the port?: ")
url = "http://ports.com/"
totalurl = "http://ports.com/" + a + "/" + b + "/"
htmlfile = request.urlopen(url)
htmltext = htmlfile.read()
regex = '<span class="small' + a + "/" + b + "/" '">...</span>'
pattern = re.compile(regex)
with urllib.request.urlopen(url) as response:
    html = response.read().decode()
num = re.findall(pattern, html)
print(num)

This is the error message I receive:

What country is your port in?: greece
What is the name of the port?: port-of-eleusis
Traceback (most recent call last):
  File "/Users/kieronblemings/Desktop/PROGRAMS PYTHON/ports extraction.py", line 13, in <module>
    htmlfile = request.urlopen(url)
NameError: name 'request' is not defined
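The traceback names the problem: the module was imported as urllib.request, so the bare name request is never defined. A minimal sketch of the fix is to use the fully qualified name that matches the import:

import urllib.request

url = "http://ports.com/"
# urllib.request.urlopen, not request.urlopen
htmlfile = urllib.request.urlopen(url)
htmltext = htmlfile.read()
print(len(htmltext))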
Python 3.x unsupported operand type in using encode decode
I am trying to build a generic crawler for my marketing project and keep track of where the information came from, viz. blogs, testimonials etc. I am using Python 3.5 and Spyder/PyCharm as IDE, and I keep getting the following error when using encode/decode. The input to my code is a list of company names and product features in an Excel file. I also searched for possible solutions, but the recommendations in the community are for typecasting, which I am not sure is the problem. Kindly let me know if some more clarification is required from my side.

from __future__ import division, unicode_literals
import codecs
import re
import os
import xlrd
import requests
from urllib.request import urlopen
from time import sleep
from bs4 import BeautifulSoup
import openpyxl
from collections import Counter

page = 0
b = 0
n = 0
w = 0
p = 0
o = 0
workbook = xlrd.open_workbook("C:\Product.xlsx")
workbook1 = xlrd.open_workbook("C:\linkslist.xlsx")
sheet_names = workbook.sheet_names()
sheet_names1 = workbook1.sheet_names()
wb = openpyxl.Workbook()  # User Spreadsheet
ws = wb.active
ws.title = "User"
ws['A1'] = 'Feature'
ws['B1'] = 'Customer-Testimonials'
ws['C1'] = 'Case Study'
ws['D1'] = 'Blog'
ws['E1'] = 'Press'
ws['F1'] = 'Total posts'
ws1 = wb.create_sheet(title="Ml")
ws1['A1'] = 'Feature'
ws1['B1'] = 'Phrase'
ws1['C1'] = 'Address'
ws1['D1'] = 'Tag Count'
worksheet = workbook.sheet_by_name(sheet_names[0])
worksheet1 = workbook1.sheet_by_name(sheet_names[0])
for linknumber in range(0, 25):
    u = worksheet1.cell(linknumber, 0).value
    url = 'www.' + u.lower() + '.com'
    print(url)
    r = ''
    while r == '':
        try:
            print("in loop")
            r = requests.get("http://" + url)
        except:
            sleep(3)  # if the code still gives that error then try increasing the sleep time to 5 maybe
    print(r)
    data = r.text
    # print data
    soup1 = BeautifulSoup(data, "html.parser")
    # print soup1
    num = 3  # starting row number and keep the column same.
    word = ''
    word = worksheet.cell(num, 3).value
    while not word == 'end':
        print(num)
        # print word
        tag_list = []
        phrase = []
        counts = []
        address = []
        counts = Counter(tag_list)
        for link in soup1.find_all('a'):
            # print link
            add = link.encode("ascii", "ignore")
            print(add)
            if not 'Log In' in add:
                # print link.get('href')
                i = 0
                content = ''
                for i in range(1, 5):
                    if content == '':
                        try:
                            print(link.get('href'))
                            i += 1
                            req = urllib.request.Request(link.get('href'))
                            with urllib.request.urlopen(req) as response:
                                content = response.read()
                        except:
                            sleep(3)  # if the code still gives that error then try increasing the sleep time to 5 maybe
                            continue
                soup = BeautifulSoup(content, "html.parser")
                s = soup(text=re.compile(word))
                if s:
                    print("TRUE")
                    add = link.encode('ascii', 'ignore')
                    print(type(add))
                    if 'customer-testimonial' in add:
                        b += 1
                    elif 'case-study' in add:
                        n += 1
                    elif 'blog' in add:
                        w += 1
                    elif 'press' in add:
                        p += 1
                    else:
                        o += 1
                    # phrase_type=["Customer testimonials","news","ads","twitter","facebook","instagram"]
                    # print(os.path.join(root, name))
                    print(add)
                    for tag in s:
                        parent_html = tag.parent.name
                        print(parent_html)
                        tag_list.append(parent_html)
                    phrase.append(s)
                    address.append(add)
                    # print str(phrase)
                    counts = Counter(tag_list)
                    page += 1
            else:
                counts = Counter(tag_list)
        no = num - 1
        print(counts)
        print(word)
        ws['A%d' % no] = word.encode('utf-8', 'ignore')
        ws1['A%d' % no] = word.encode('utf-8', 'ignore')
        print("Number of pages is %d" % page)
        print("Number of Customer testimonials posts is %d" % b)
        ws['B%d' % no] = b
        print("Number of Case Studies posts is %d" % n)
        ws['C%d' % no] = n
        print("Number of blog posts is %d" % w)
        ws['D%d' % no] = w
        print("Number of press posts is %d" % p)
        ws['E%d' % no] = p
        print("Number of posts is %d" % page)
        ws['F%d' % no] = page
        ws1['B%d' % no] = phrase.encode('utf-8', 'ignore')
        ws1['C%d' % no] = address.encode('utf-8', 'ignore')
        ws1['D%d' % no] = counts.encode('utf-8', 'ignore')
        counts.clear()
        num += 1
        word = worksheet.cell(num, 3).value
        # print word
        page = 0
        b = 0
        n = 0
        w = 0
        p = 0
        o = 0
        phrase = []
        address = []
        tag_list = []
    wb.save('%s.xlsx' % worksheet1.cell(linknumber, 0).value)

I get the following output and error while running the code:

www.amobee.com
in loop
<Response [200]>
3
Traceback (most recent call last):
  File "C:/project_web_parser.py", line 69, in <module>
    add = link.encode("ascii", "ignore")
  File "C:\ProgramData\Ana3\lib\site-packages\bs4\element.py", line 1094, in encode
    u = self.decode(indent_level, encoding, formatter)
  File "C:\ProgramData\Ana3\lib\site-packages\bs4\element.py", line 1159, in decode
    indent_space = (' ' * (indent_level - 1))
TypeError: unsupported operand type(s) for -: 'str' and 'int'

Process finished with exit code 1
The traceback shows the error is in line 69, where you try to encode link. To fix it, just change that line to:

add = link.encode("ascii", errors="ignore")

Why does it happen? Your link variable is of type bs4.element.Tag:

>>> type(link)
<class 'bs4.element.Tag'>

The .encode() method for tags takes more arguments than the .encode() method for strings. In the source code of bs4, in file \bs4\element.py on line 1089, you can find its definition:

def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):

The first argument is encoding, the second is indent_level (int or None), and errors handling is the fourth. The error unsupported operand type(s) for -: 'str' and 'int' means that your positional "ignore" landed in indent_level, so bs4 tried to compute 'ignore' - 1.
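You can see the difference in a toy example (any parsed tag will do):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<a href="/login">Log In</a>', "html.parser")
link = soup.find('a')

print(link.encode("ascii", errors="ignore"))  # works: errors passed by keyword
# link.encode("ascii", "ignore")  # TypeError: "ignore" is taken as indent_level
print(str(link))  # often simpler: the tag's markup as a str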