I built a web crawler with Scrapy and I store the data in a MySQL database (I crawl the source code from a URL), and now I would like to do the parsing offline. So I have created SQL queries in Python to export the data, and then I try to crawl from that stored source. Could you please suggest how to do it? I failed to do it with Scrapy; if anyone has any suggestion or a similar project and can help me out, I'd appreciate it.
Here is what I have tried: query the database with Scrapy and parse the stored data.
from scrapy.http import HtmlResponse
import mysql.connector
from mysql.connector import Error
import scrapy
import re
import requests
from bs4 import BeautifulSoup


# Connects to the database, queries all URLs that have already been crawled
# and returns the stored records.
class database:

    def query():
        records = []
        try:
            connection = mysql.connector.connect(host='',
                                                 database='',
                                                 user='',
                                                 password='')
            if connection.is_connected():
                db_Info = connection.get_server_info()
                done = "Connected to MySQL database... MySQL Server version on " + db_Info
            sql_select_Query = """SELECT `job_url`, `job_description` FROM `store_all`
                                  WHERE job_url LIKE '%kariera.gr%'"""
            cursor = connection.cursor()
            cursor.execute(sql_select_Query)
            records = cursor.fetchall()
        except mysql.connector.Error as error:
            not_done = "Failed to connect {}".format(error)
        return records

    def insert(job_url, metakey, metavalue):
        done = ""
        try:
            connection = mysql.connector.connect(host='',
                                                 database='',
                                                 user='',
                                                 password='')
            cursor = connection.cursor(prepared=True)
            sql_insert_query = """INSERT INTO `store` (`url`, `metakey`, `metavalue`)
                                  VALUES (%s, %s, %s)"""
            insert_tuple = (job_url, metakey, metavalue)
            cursor.execute(sql_insert_query, insert_tuple)
            connection.commit()
            done = "Record inserted successfully"
        except mysql.connector.Error as error:
            connection.rollback()
            done = "Failed to insert into MySQL table {}".format(error)
        return done


class Crawler(scrapy.Spider, database):
    records = database.query()
    records = records[0]
    response = HtmlResponse(url="Any String", body=records, encoding='utf-8')
    job = response.xpath('//ul[@class="tab_content"]/text()').extract()
    url = records
    metakey = "test"
    metavalue = "test"
    print(database.query())
    print(database.insert(url, metakey, metavalue))
The issue is actually solved: joining the stored body into a single string and wrapping it in a TextResponse works.
b = ''.join(body1)
response = TextResponse(url="Any String", body=b, encoding='utf-8')
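For completeness, here is a minimal sketch of the offline approach, assuming the same `store_all` table and placeholder credentials as above, and assuming `job_description` holds the stored page source as text: read each row from MySQL and wrap it in an HtmlResponse so the usual Scrapy selectors work without running a spider at all.

import mysql.connector
from scrapy.http import HtmlResponse

# Placeholder credentials, same as in the question.
connection = mysql.connector.connect(host='', database='', user='', password='')
cursor = connection.cursor()
cursor.execute("SELECT `job_url`, `job_description` FROM `store_all` "
               "WHERE job_url LIKE '%kariera.gr%'")

for job_url, job_description in cursor.fetchall():
    # Wrap the stored source code in an HtmlResponse; selectors then work offline.
    response = HtmlResponse(url=job_url, body=job_description, encoding='utf-8')
    jobs = response.xpath('//ul[@class="tab_content"]//text()').extract()
    print(job_url, jobs)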
I'm calling a simple Python function in Google Cloud Functions but cannot get it to save. It shows this error:
"Function failed on loading user code. This is likely due to a bug in the user code. Error message: Error: please examine your function logs to see the error cause: https://cloud.google.com/functions/docs/monitoring/logging#viewing_logs. Additional troubleshooting documentation can be found at https://cloud.google.com/functions/docs/troubleshooting#logging. Please visit https://cloud.google.com/functions/docs/troubleshooting for in-depth troubleshooting documentation."
Logs don't seem to show much that would indicate an error in the code. I followed this guide: https://blog.thereportapi.com/automate-a-daily-etl-of-currency-rates-into-bigquery/
The only differences are the environment variables and the endpoint I'm using.
The code is below; it is just a GET request followed by a push of the data into a BigQuery table.
import requests
import json
import time
import os
from google.cloud import bigquery

# Set default values for these variables if they are not found in environment variables
PROJECT_ID = os.environ.get("PROJECT_ID", "xxxxxxxxxxxxxx")
EXCHANGERATESAPI_KEY = os.environ.get("EXCHANGERATESAPI_KEY", "xxxxxxxxxxxxxxx")
REGIONAL_ENDPOINT = os.environ.get("REGIONAL_ENDPOINT", "europe-west1")
DATASET_ID = os.environ.get("DATASET_ID", "currency_rates")
TABLE_NAME = os.environ.get("TABLE_NAME", "currency_rates")
BASE_CURRENCY = os.environ.get("BASE_CURRENCY", "SEK")
SYMBOLS = os.environ.get("SYMBOLS", "NOK,EUR,USD,GBP")

def hello_world(request):
    latest_response = get_latest_currency_rates()
    write_to_bq(latest_response)
    return "Success"

def get_latest_currency_rates():
    PARAMS = {'access_key': EXCHANGERATESAPI_KEY, 'symbols': SYMBOLS, 'base': BASE_CURRENCY}
    response = requests.get("https://api.exchangeratesapi.io/v1/latest", params=PARAMS)
    print(response.json())
    return response.json()

def write_to_bq(response):
    # Instantiates a client
    bigquery_client = bigquery.Client(project=PROJECT_ID)

    # Prepares a reference to the dataset and table
    dataset_ref = bigquery_client.dataset(DATASET_ID)
    table_ref = dataset_ref.table(TABLE_NAME)
    table = bigquery_client.get_table(table_ref)

    # get the current timestamp so we know how fresh the data is
    timestamp = time.time()

    # Ensure the response is a string, not JSON
    jsondump = json.dumps(response)  # Returns a string

    rows_to_insert = [{"timestamp": timestamp, "data": jsondump}]
    errors = bigquery_client.insert_rows(table, rows_to_insert)  # API request
    print(errors)
    assert errors == []
I tried just the part that does the GET request in an offline editor and I can confirm the response works fine. I suspect it might have something to do with permissions or the way the script tries to access the database.
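"Function failed on loading user code" usually means the module failed to import before the function was ever invoked, for example because a dependency is missing from requirements.txt. One way to reproduce that outside Cloud Functions is a local smoke test like the sketch below; it assumes the code above is saved as main.py and that local credentials are available via GOOGLE_APPLICATION_CREDENTIALS (the file name local_check.py and the FakeRequest helper are mine, purely for illustration).

# local_check.py - a sketch only, assuming the function above lives in main.py.
# Importing the module reproduces any load-time failure (e.g. a missing package),
# which is what "Function failed on loading user code" points at.
import main


class FakeRequest:
    """Minimal stand-in for the Flask request object the entry point receives."""
    args = {}


print(main.hello_world(FakeRequest()))

If the import itself fails locally, the usual fix is to list the missing packages (here google-cloud-bigquery and requests) in the requirements.txt deployed next to main.py.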
import pandas
import pygrametl
import psycopg2
from pygrametl.tables import SlowlyChangingDimension, CachedDimension, BulkDimension
from pygrametl.datasources import CSVSource

## Connection to Postgres
connection = psycopg2.connect(host="localhost", database="postgres", user="postgres",
                              password="tekihcan")
connect = pygrametl.ConnectionWrapper(connection)

def pgcopybulkloader(name, atts, fieldsep, rowsep, nullval, filehandle):
    # Here we use driver-specific code to get fast bulk loading.
    # You can change this method if you use another driver or you can
    # use the FactTable or BatchFactTable classes (which don't require
    # use of driver-specific code) instead of the BulkFactTable class.
    global connection
    curs = connection.cursor()
    try:
        curs.copy_from(file=filehandle, table=name, sep=fieldsep,
                       columns=atts, null='null')
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error %s" % error)

date_dim = BulkDimension(name='date_dim', key='d_date_sk', attributes=[
    'd_date_id (B)',
    'd_date',
    'd_month_seq',
    'd_week_seq',
    'd_quarter_seq',
    'd_year',
    'd_dow',
    'd_moy',
    'd_dom',
    'd_qoy',
    'd_fy_year',
    'd_fy_quarter_seq',
    'd_fy_week_seq',
    'd_day_name',
    'd_quarter_name',
    'd_holiday',
    'd_weekend',
    'd_following_holiday',
    'd_first_dom',
    'd_last_dom',
    'd_same_day_ly',
    'd_same_day_lq',
    'd_current_day',
    'd_current_week',
    'd_current_month',
    'd_current_quarter',
    'd_current_year',
], lookupatts=['d_date_id (B)'],
   bulkloader=pgcopybulkloader)

date_dim_source = CSVSource(open('C:/Users/HP/Documents/v2.13.0rc1/data/date_dim.csv',
                                 'r', 16384), delimiter='|')

def main():
    for row in date_dim_source:
        date_dim.insert(row)
The code is failing with an error. As per my understanding, the error is caused because the target table is empty. The CSV source doesn't have a header row either. Could this be impacting the code?
Please find the link that was used to develop the code - https://chrthomsen.github.io/pygrametl/
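Two things may be worth checking; the hedged sketch below shows both. CSVSource behaves like csv.DictReader, so when the file has no header row the field names can be passed in explicitly (the column order below is an assumption and must match the file; the names are taken from the dimension definition above). And nothing is loaded until main() actually runs and the connection wrapper is committed.

# A sketch only: supply field names because the CSV has no header row,
# then run main() and commit so the buffered rows are bulk-loaded.
attributes = ['d_date_id (B)', 'd_date', 'd_month_seq', 'd_week_seq',
              'd_quarter_seq', 'd_year', 'd_dow', 'd_moy', 'd_dom', 'd_qoy',
              'd_fy_year', 'd_fy_quarter_seq', 'd_fy_week_seq', 'd_day_name',
              'd_quarter_name', 'd_holiday', 'd_weekend', 'd_following_holiday',
              'd_first_dom', 'd_last_dom', 'd_same_day_ly', 'd_same_day_lq',
              'd_current_day', 'd_current_week', 'd_current_month',
              'd_current_quarter', 'd_current_year']

# Assumes d_date_sk is the first column in the file, followed by the
# attributes in the order listed above.
date_dim_source = CSVSource(open('C:/Users/HP/Documents/v2.13.0rc1/data/date_dim.csv',
                                 'r', 16384),
                            delimiter='|',
                            fieldnames=['d_date_sk'] + attributes)

if __name__ == '__main__':
    main()
    connect.commit()  # flush pending rows and commit the transaction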
I am migrating data from IBM to Snowflake in 3 stages: extract, transform and load.
Below is the Python code that connects the source (IBM) to the destination (Snowflake) and does the ETL.
Is there any way I can create a class/package out of the entire code below?
import snowflake.connector

tableName = 'F58001'
ctx = snowflake.connector.connect(
    user='*',
    password='*',
    account='*.azure'
)
cs = ctx.cursor()
ctx.cursor().execute("USE DATABASE STORE_PROFILE")
ctx.cursor().execute("USE SCHEMA LANDING")

try:
    ctx.cursor().execute("PUT file:///temp/data/{tableName}/* @%{tableName}".format(tableName=tableName))
except Exception:
    pass

ctx.cursor().execute("truncate table {tableName}".format(tableName=tableName))
ctx.cursor().execute("COPY INTO {tableName} ON_ERROR = 'CONTINUE' ".format(tableName=tableName,
                     FIELD_OPTIONALLY_ENCLOSED_BY='""', sometimes=',',
                     ERROR_ON_COLUMN_COUNT_MISMATCH='TRUE'))

last_query_id = ctx.cursor().execute("select last_query_id()")
for res in last_query_id:
    query_id = res[0]

ctx.cursor().execute("create or replace table save_copy_errors as select * from "
                     "table(validate(" + tableName + ", job_id => '" + query_id + "'))")

ax = ctx.cursor().execute("select * from save_copy_errors")
for errors in ax:
    error = errors
    print(error)

ctx.close()
Please look at the repository below. It probably has the answer to your question. I am currently working on moving it to PyPI so that it can be installed with pip.
https://github.com/Infosys/Snowflake-Python-Development-Framework
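As for wrapping the code in a class: purely for illustration, here is a minimal hedged sketch (the class name, method names and placeholder credentials are mine, not part of the framework above) that packages the same PUT / TRUNCATE / COPY steps so they can be reused per table.

# A sketch only: a small wrapper around the steps from the question.
import snowflake.connector


class SnowflakeLoader:
    def __init__(self, user, password, account, database, schema):
        self.ctx = snowflake.connector.connect(user=user, password=password,
                                               account=account)
        self.ctx.cursor().execute(f"USE DATABASE {database}")
        self.ctx.cursor().execute(f"USE SCHEMA {schema}")

    def load_table(self, table_name):
        cs = self.ctx.cursor()
        cs.execute(f"PUT file:///temp/data/{table_name}/* @%{table_name}")
        cs.execute(f"truncate table {table_name}")
        cs.execute(f"COPY INTO {table_name} ON_ERROR = 'CONTINUE'")
        # Return the query id of the COPY so errors can be validated afterwards.
        return cs.execute("select last_query_id()").fetchone()[0]

    def close(self):
        self.ctx.close()


# Usage, assuming the same table and placeholders as above:
# loader = SnowflakeLoader('*', '*', '*.azure', 'STORE_PROFILE', 'LANDING')
# query_id = loader.load_table('F58001')
# loader.close()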
My task is to write a Python script that can take results from BigQuery and email them out. I've written code that can successfully send an email, but I am having trouble including the results of the BigQuery query in the actual email. The query results are correct, but the object I am returning from the query (results) always comes back as a NoneType.
For example, the email should look like this:
Hello,
You have the following issues that have been "open" for more than 7 days:
-List issues here from bigquery code
Thanks.
The code reads in contacts from a contacts.txt file, and it reads in the email message template from a message.txt file. I tried to turn the BigQuery object into a string, but it still results in an error.
from google.cloud import bigquery
import warnings
warnings.filterwarnings("ignore", "Your application has authenticated using end user credentials")
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from string import Template

def query_emailtest():
    client = bigquery.Client(project=("analytics-merch-svcs-thd"))
    query_job = client.query("""
        select dept, project_name, reset, tier, project_status, IssueStatus, division, store_number, top_category,
        DATE_DIFF(CURRENT_DATE(), in_review, DAY) as days_in_review
        from `analytics-merch-svcs-thd.MPC.RESET_DETAILS`
        where in_review IS NOT NULL
        AND IssueStatus = "In Review"
        AND DATE_DIFF(CURRENT_DATE(), in_review, DAY) > 7
        AND ready_for_execution IS NULL
        AND project_status = "Active"
        AND program_name <> "Capital"
        AND program_name <> "SSI - Capital"
        LIMIT 50
    """)
    results = query_job.result()  # Waits for job to complete.
    return results  # THIS IS A NONETYPE

def get_queryresults(results):  # created new method to put query results into a for loop and store it in a variable
    for i, row in enumerate(results, 1):
        bq_data = (i, '. ' + str(row.dept) + " " + row.project_name + ", Reset #: " + str(row.reset) + ", Store #: " + str(row.store_number) + ", " + row.IssueStatus + " for " + str(row.days_in_review) + " days")
        print(bq_data)

def get_contacts(filename):
    names = []
    emails = []
    with open(filename, mode='r', encoding='utf-8') as contacts_file:
        for a_contact in contacts_file:
            names.append(a_contact.split()[0])
            emails.append(a_contact.split()[1])
    return names, emails

def read_template(filename):
    with open(filename, 'r', encoding='utf-8') as template_file:
        template_file_content = template_file.read()
    return Template(template_file_content)

names, emails = get_contacts('mycontacts.txt')  # read contacts
message_template = read_template('message.txt')
results = query_emailtest()
bq_results = get_queryresults(query_emailtest())

import smtplib

# set up the SMTP server
s = smtplib.SMTP(host='smtp-mail.outlook.com', port=587)
s.starttls()
s.login('email', 'password')

# For each contact, send the email:
for name, email in zip(names, emails):
    msg = MIMEMultipart()  # create a message
    # bq_data = get_queryresults(query_emailtest())

    # add in the actual person name to the message template
    message = message_template.substitute(PERSON_NAME=name.title())
    message = message_template.substitute(QUERY_RESULTS=bq_results)  # SUBSTITUTE QUERY RESULTS IN MESSAGE TEMPLATE. This is where I am having trouble because the Row Iterator object results in Nonetype.

    # setup the parameters of the message
    msg['From'] = 'email'
    msg['To'] = 'email'
    msg['Subject'] = "This is TEST"

    # body = str(get_queryresults(query_emailtest()))  # get query results from method to put into message body
    # add in the message body
    # body = MIMEText(body)
    # msg.attach(body)
    msg.attach(MIMEText(message, 'plain'))
    # query_emailtest()
    # get_queryresults(query_emailtest())

    # send the message via the server set up earlier.
    s.send_message(msg)
    del msg
Message template:
Dear ${PERSON_NAME},
Hope you are doing well. Please find the following alert for Issues that have been "In Review" for greater than 7 days.
${QUERY_RESULTS}
If you would like more information, please visit this link that contains a complete dashboard view of the alert.
ISE Services
The BQ result() function returns an iterator, so I think you need to change your return to yield from.
I'm far from a Python expert, but the following pared-down code worked for me.
from google.cloud import bigquery
import warnings
warnings.filterwarnings("ignore", "Your application has authenticated using end user credentials")

def query_emailtest():
    client = bigquery.Client(project=("my_project"))
    query_job = client.query("""
        select field1, field2 from `my_dataset.my_table` limit 5
    """)
    results = query_job.result()
    yield from results  # NOTE THE CHANGE HERE

results = query_emailtest()
for row in results:
    print(row.field1, row.field2)
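Applied back to the original script, here is a hedged sketch (the field names come from the question's query; the function name is mine) of collecting the rows into one string so that both placeholders can be substituted in a single call instead of printing inside get_queryresults.

def format_queryresults(results):
    # Build one line per row so the result can be dropped into the
    # ${QUERY_RESULTS} placeholder of the message template.
    lines = []
    for i, row in enumerate(results, 1):
        lines.append("{}. {} {}, Reset #: {}, Store #: {}, {} for {} days".format(
            i, row.dept, row.project_name, row.reset, row.store_number,
            row.IssueStatus, row.days_in_review))
    return "\n".join(lines)

bq_results = format_queryresults(query_emailtest())
message = message_template.substitute(PERSON_NAME=name.title(),
                                      QUERY_RESULTS=bq_results)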
I am trying to change the data directory of the CouchDB database. I am using a Python script to import a CSV file into CouchDB. The script runs fine. Here it is, just in case:
from couchdbkit import Server, Database
from couchdbkit.loaders import FileSystemDocsLoader
from csv import DictReader
import sys, subprocess, math, os

def parseDoc(doc):
    for k, v in doc.items():
        if isinstance(v, str):
            # print k, v, v.isdigit()
            # see if this string is really an int or a float
            if v.isdigit() == True:  # int
                doc[k] = int(v)
            else:  # try a float
                try:
                    if math.isnan(float(v)) == False:
                        doc[k] = float(v)
                except:
                    pass
    return doc

def upload(db, docs):
    db.bulk_save(docs)
    del docs
    return list()

def uploadFile(fname, dbname):
    # connect to the db
    theServer = Server()
    db = theServer.get_or_create_db(dbname)

    # loop on file for upload
    reader = DictReader(open(fname, 'rU'), dialect='excel')
    docs = list()
    checkpoint = 100
    i = 0
    for doc in reader:
        newdoc = parseDoc(doc)
        docs.append(newdoc)
        if len(docs) % checkpoint == 0:
            docs = upload(db, docs)
            i += 1
            print 'Number : %d' % i

    # don't forget the last batch
    docs = upload(db, docs)

if __name__ == '__main__':
    x = '/media/volume1/Crimes_-_2001_to_present.csv'
    filename = x
    dbname = 'test'
    uploadFile(filename, dbname)
I saw plenty of posts on how to change the directory where the databases are stored. If I leave /etc/couchdb/local.ini as it is (the original after installation), the script appends data to the default directory /var/lib/couchdb/1.0.1/. When I modify local.ini to store the database on another disk:
database_dir = /media/volume1
view_index_dir = /media/volume1
and then restart the CouchDB service, I get this error:
restkit.errors.RequestError: socket.error: [Errno 111] Connection refused
I have checked the open sockets (CouchDB uses port 5984 by default) and it is not open. But I get no errors when I start the CouchDB service.
Any ideas how to fix it?
I think the error may be because you changed the directory location in local.ini, but when you try to make a new connection to the existing database, CouchDB cannot find it in the new location.
So move the database_name.couch file to the new location that you set in local.ini and then try to make a connection again. I think this should work.
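For illustration only, a minimal Python sketch of that move, assuming the database created by the script above is named test, the default 1.0.1 data directory from the question, and that CouchDB is stopped while the file is moved (the couchdb user must still own the file afterwards):

import shutil

# Paths are assumptions based on the question: the default database_dir on the
# left, the new database_dir from local.ini on the right.
old_path = '/var/lib/couchdb/1.0.1/test.couch'
new_path = '/media/volume1/test.couch'

shutil.move(old_path, new_path)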