Multithreading in a Python search engine crawler causing a hang - multithreading

I'm trying to teach myself programming and I've run into a wall with multithreading. I'm using it to try to speed up my Google crawler. Can someone point me in the right direction?
import re

import requests
from bs4 import BeautifulSoup

# Requires a search string and page numbers to scan
def google(search_string, start):
    temp = []
    url = 'http://www.google.com/search'
    payload = {'q': search_string, 'start': start}
    my_headers = {'User-agent': 'Mozilla/11.0'}
    r = requests.get(url, params=payload, headers=my_headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    h3tags = soup.find_all('h3', class_='r')
    # Print and write the scraped URLs
    with open("test.txt", "w") as out_file:
        for h3 in h3tags:
            try:
                link = re.search(r'url\?q=(.+?)&sa', h3.a['href']).group(1)
                print(link)
                out_file.write(link + '\n')
                temp.append(link)
            except AttributeError:
                continue
    return temp
and
from functools import partial
from multiprocessing.dummy import Pool  # thread-backed Pool; the original import is not shown in the question
from timeit import default_timer as timer

def main():
    start = timer()
    result = []
    search = input("Please enter Dork String(<dork - no include inurl:> <extra-terms>):")
    pages = int(input("how many URL's would you like?:"))
    pages = pages / 10
    processes = int(input("How many threads (<= 8):"))
    make_request = partial(google, search)
    pagelist = [str(x * 10) for x in range(0, int(pages))]
    # Multithreads ??
    with Pool(processes) as p:
        tmp = p.map(make_request, pagelist)
    for x in tmp:
        result.extend(x)
    result = list(set(result))
    print(*result, sep='\n')
    # stats
    print('\nTotal URLs Scraped : %s ' % str(len(result)))
    print('Script Execution Time : %s ' % (timer() - start,))
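If the hang comes from creating a process pool without an if __name__ == '__main__': guard (a common cause on Windows, where multiprocessing re-imports the script), a thread pool sidesteps it entirely. Below is a minimal sketch, not the asker's code, of the same fan-out with concurrent.futures; fetch_all and its parameters are illustrative names, and google() is the function defined above.

# Minimal sketch (assumes google() from the question is defined in this module)
from concurrent.futures import ThreadPoolExecutor
from functools import partial

def fetch_all(search, pagelist, workers=8):
    make_request = partial(google, search)
    results = []
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # map() blocks until every page has been fetched
        for urls in pool.map(make_request, pagelist):
            results.extend(urls)
    return list(set(results))

if __name__ == '__main__':
    print(fetch_all("example dork", [str(x * 10) for x in range(3)]))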

Related

How to get the google search console data using access_token or refresh token in python?

I'm trying to get data from Google Search Console on behalf of the user. Once they log in, it returns an access_token and a refresh_token. Using the access_token or refresh_token, how can I get the Google Search Console data (impressions, clicks, pages)?
I am getting the data from Google Analytics this way, but with Google Search Console it doesn't seem possible.
import datetime
from collections import defaultdict

import pandas as pd
from dateutil import relativedelta

# get_domain_name, create_project, get_dates_from_csv, authorize_creds,
# execute_request and write_to_csv are my own helper functions.

def extract_data(site, creds, num_days, output):
    domain_name = get_domain_name(site)
    create_project(domain_name)
    full_path = domain_name + '/' + output
    current_dates = get_dates_from_csv(full_path)
    webmasters_service = authorize_creds(creds)
    # Set up dates
    end_date = datetime.date.today() - relativedelta.relativedelta(days=3)
    start_date = end_date - relativedelta.relativedelta(days=num_days)
    delta = datetime.timedelta(days=1)  # This will let us loop one day at a time
    scDict = defaultdict(list)
    while start_date <= end_date:
        if current_dates is not None and current_dates.str.contains(
                datetime.datetime.strftime(start_date, '%Y-%m-%d')).any():
            start_date += delta
        else:
            maxRows = 25000  # Maximum 25K rows per call
            numRows = 0      # Start at row zero
            status = ''      # Initialize status of the extraction
            while status != 'Finished':
                request = {
                    'startDate': datetime.datetime.strftime(start_date, '%Y-%m-%d'),
                    'endDate': datetime.datetime.strftime(start_date, '%Y-%m-%d'),
                    'dimensions': ['date', 'page', 'query'],
                    'rowLimit': maxRows,
                    'startRow': numRows
                }
                response = execute_request(webmasters_service, site, request)
                try:
                    # Process the response
                    for row in response['rows']:
                        scDict['date'].append(row['keys'][0] or 0)
                        scDict['page'].append(row['keys'][1] or 0)
                        scDict['query'].append(row['keys'][2] or 0)
                        scDict['clicks'].append(row['clicks'] or 0)
                        scDict['ctr'].append(row['ctr'] or 0)
                        scDict['impressions'].append(row['impressions'] or 0)
                        scDict['position'].append(row['position'] or 0)
                except:
                    print('error occurred at %i' % numRows)
                # Add the response to a dataframe
                df = pd.DataFrame(data=scDict)
                df['clicks'] = df['clicks'].astype('int')
                df['ctr'] = df['ctr'] * 100
                df['impressions'] = df['impressions'].astype('int')
                df['position'] = df['position'].round(2)
                print('Numrows at the start of loop: %i' % numRows)
                try:
                    numRows = numRows + len(response['rows'])
                except:
                    status = 'Finished'
                print('Numrows at the end of loop: %i' % numRows)
                if numRows % maxRows != 0:
                    status = 'Finished'
            start_date += delta
            print('Start date at end: %s' % start_date)
    write_to_csv(df, full_path)
    return df
This is the code I am using for Google Search Console; it authenticates through the webmasters_service = authorize_creds(creds) helper, but I want to access the API using an access_token or refresh_token instead.
This is the code I use for Google Analytics.
import datetime
import json
import re

import httplib2
import pandas as pd
import requests
from oauth2client import GOOGLE_REVOKE_URI, GOOGLE_TOKEN_URI, client

# client_id and client_secret are defined elsewhere in my script

def google_analytics_reporting_api_data_extraction(viewID, dim, met, start_date,
                                                   end_date, refresh_token,
                                                   transaction_type, goal_number,
                                                   condition):
    viewID = "".join(['ga%3A', viewID])
    if transaction_type == "Goal":
        met1 = "%2C".join([re.sub(":", "%3A", i) for i in met]).replace("XX", str(goal_number))
    elif transaction_type == "Transaction":
        met1 = "%2C".join([re.sub(":", "%3A", i) for i in met])
    dim1 = "%2C".join([re.sub(":", "%3A", i) for i in dim])
    credentials = client.OAuth2Credentials(
        access_token=None, client_id=client_id, client_secret=client_secret,
        refresh_token=refresh_token, token_expiry=3600, token_uri=GOOGLE_TOKEN_URI,
        user_agent='my-user-agent/1.0', revoke_uri=GOOGLE_REVOKE_URI)
    credentials.refresh(httplib2.Http())
    rt = (json.loads(credentials.to_json()))['access_token']
    api_url = "https://www.googleapis.com/analytics/v3/data/ga?ids="
    url = "".join(
        [api_url, viewID, '&start-date=', start_date, '&end-date=', end_date,
         '&metrics=', met1, '&dimensions=', dim1, '&max-results=1000000',
         condition, '&access_token=', rt])
    data = pd.DataFrame()
    dataa = pd.DataFrame()
    users = []
    final_date = []
    r = requests.get(url)
    start = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end - start).days)]
    for each in date_generated:
        date_value = each.date()
        url = "".join(
            [api_url, viewID, '&start-date=', str(each.date()), '&end-date=', str(each.date()),
             '&metrics=', met1, '&dimensions=', dim1, '&max-results=1000000',
             condition, '&access_token=', rt])
        rr = requests.get(url)
        dataa = pd.DataFrame(list((rr.json())['rows']))
        users.append(dataa[0][0])
        final_date.append(str(date_value))
    data = pd.DataFrame(list((r.json())['rows']))
    try:
        data = pd.DataFrame(list((r.json())['rows']),
                            columns=[re.sub("ga:", "", i) for i in met])
        return data, users, final_date
    except:
        print((r.json()))
In the code above we access the Google Analytics data using the refresh_token. I want to access the Google Search Console data in the same way.
Please help me out.
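Not part of the original post, but one commonly used approach, sketched under the assumption that the OAuth client_id/client_secret and a stored refresh_token are available: build google.oauth2 Credentials directly from the refresh token and hand them to the Webmasters (Search Console) client from google-api-python-client. The function name build_search_console_service is illustrative.

# Sketch: a Search Console (Webmasters API) client built from a refresh token
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build

def build_search_console_service(client_id, client_secret, refresh_token):
    creds = Credentials(
        token=None,  # refreshed automatically on the first request
        refresh_token=refresh_token,
        token_uri='https://oauth2.googleapis.com/token',
        client_id=client_id,
        client_secret=client_secret,
        scopes=['https://www.googleapis.com/auth/webmasters.readonly'],
    )
    return build('webmasters', 'v3', credentials=creds)

# The returned service could stand in for authorize_creds(creds) above, e.g.:
# webmasters_service = build_search_console_service(client_id, client_secret, refresh_token)
# response = webmasters_service.searchanalytics().query(siteUrl=site, body=request).execute()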

Make multiprocessing Pool to use free cores if available

I have the following piece of code that uses a Pool of workers to perform some operations.
import copy
import multiprocessing as mp
from itertools import repeat
from time import time

import numpy as np
import psutil

def my_func(args):
    low_index = args[0][0]
    up_index = args[0][1]
    params = args[1][0]
    A = args[1][1]
    B = args[1][2]
    print("PID:", mp.current_process())
    for k in range(low_index, up_index):
        a = params[k]
        # what if np.dot uses multi-threading?
        A = a * A + (np.dot(A, B)) * (np.dot(B, B))
        B = a * B + (np.dot(B, A)) * (np.dot(A, A))
    return A, B

if __name__ == '__main__':
    ts = time()
    params = np.linspace(1, 10, 1000)
    n_dim = 1000
    # the arrays A, B get modified with each call to the worker
    A = np.random.rand(n_dim, n_dim)
    B = np.random.rand(n_dim, n_dim)
    C = np.random.rand(5 * n_dim, 5 * n_dim)
    D = np.random.rand(5 * n_dim, 5 * n_dim)
    ncpus = psutil.cpu_count(logical=False)
    number_processes = ncpus - 1
    total_items = params.shape[0]
    n_chunk = int(total_items / number_processes)
    intervals = [[k * n_chunk, (k + 1) * n_chunk] for k in range(number_processes)]
    intervals[-1][-1] = total_items
    objs_ = list(repeat((params,
                         copy.deepcopy(A),
                         copy.deepcopy(B),
                         ), number_processes - 1))
    objs_.append((params,
                  copy.deepcopy(C),
                  copy.deepcopy(D),
                  ))
    args_l = []
    for k in range(number_processes):
        args_l.append([intervals[k], objs_[k]])
    pool = mp.Pool(processes=ncpus)
    results = pool.map(my_func, args_l)
    pool.close()
    pool.join()
    print(time() - ts)
The last worker (the one given the C and D arrays) takes considerably longer than the rest, so once the other workers are done I would like that remaining process to make efficient use of all the free cores. However, CPU usage stays around 20% (on my machine 5 cores out of 6 are in use) for the last process, so the remaining operations run very inefficiently. Is there a good way to fix that?
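One thing worth noting (an addition, not part of the original question): each chunk is sequential, since every iteration feeds A and B back into the next, so the leftover work cannot be re-split across processes once the pool drains. What can scale is np.dot itself, which is delegated to the BLAS library, so the lone remaining process only uses more cores if its BLAS is allowed extra threads. Below is a sketch with threadpoolctl, giving the small chunks one BLAS thread each (to avoid oversubscription while the pool is full) and the large C/D chunk a bigger static budget; the blas_threads parameter and the budgets are illustrative, and numpy is assumed imported as np as above.

# Sketch: per-task BLAS thread budget via threadpoolctl (pip install threadpoolctl)
from threadpoolctl import threadpool_limits

def my_func(args, blas_threads=1):
    (low_index, up_index), (params, A, B) = args
    with threadpool_limits(limits=blas_threads):  # cap BLAS threads inside this worker
        for k in range(low_index, up_index):
            a = params[k]
            A = a * A + np.dot(A, B) * np.dot(B, B)
            B = a * B + np.dot(B, A) * np.dot(A, A)
    return A, B

# e.g. give the big C/D task several BLAS threads and the rest one each:
# tasks = [(arg, 1) for arg in args_l[:-1]] + [(args_l[-1], 4)]
# results = pool.starmap(my_func, tasks)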

Make my EC2 instance to utilize more CPU power

I've chosen the t2.2xlarge instance with 8 vCPUs and 32 GiB of memory. However, the performance feels the same as the "free tier" instance I used to run my Python script on before. When I look at the CPU usage on the machine, it says only 8%.
How can I utilize much more of my CPUs?
Here is the code I'm currently running on this EC2 instance:
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import pymysql
from sqlalchemy import create_engine

# finalDict, adjustWeights and newDatabaseName are defined elsewhere in my script

def connectToDB():
    databaseServerIP = "mydb.us-east-2.rds.amazonaws.com"  # Address of the MySQL database server
    databaseUserName = "mydbUsername"                      # User name for the database server
    databaseUserPassword = "mypwd"                         # Password for the database user
    cursorType = pymysql.cursors.DictCursor
    connectionInstance = pymysql.connect(host=databaseServerIP,
                                         user=databaseUserName,
                                         password=databaseUserPassword,
                                         cursorclass=cursorType,
                                         autocommit=True)
    # Create a cursor object
    cursorInstance = connectionInstance.cursor()
    return connectionInstance, cursorInstance
def construct_each_company(tmpDF_forPeerGroup, ii):
    print(tmpDF_forPeerGroup['Name'].values[ii])
    finalBigDataframe = pd.DataFrame(date_generated, index=date_generated)
    #symbolToCheck = tmpDF_forPeerGroup['Symbol'].values[ii]
    idx = tmpDF_forPeerGroup.index[ii]
    ##### dataframe 1
    try:
        connectionInstance, cursorInstance = connectToDB()
        sql = "SELECT * FROM DB1.Scores WHERE company_idx = " + str(idx)
        finalBigDataframe_1 = pd.read_sql(sql, con=connectionInstance)
    except:
        finalBigDataframe_1 = None
    ##### dataframe 2
    try:
        connectionInstance, cursorInstance = connectToDB()
        sql = "SELECT * FROM DB2.Scores WHERE company_idx = " + str(idx)
        finalBigDataframe_2 = pd.read_sql(sql, con=connectionInstance)
    except:
        finalBigDataframe_2 = None
    ##### dataframe 3
    try:
        connectionInstance, cursorInstance = connectToDB()
        sql = "SELECT * FROM DB3.Scores WHERE company_idx = " + str(idx)
        finalBigDataframe_3 = pd.read_sql(sql, con=connectionInstance)
    except:
        finalBigDataframe_3 = None
    ##### dataframe 4
    try:
        connectionInstance, cursorInstance = connectToDB()
        sql = "SELECT * FROM DB4.Scores WHERE company_idx = " + str(idx)
        finalBigDataframe_4 = pd.read_sql(sql, con=connectionInstance)
    except:
        finalBigDataframe_4 = None
    # merge for every input
    # this is not right though...
    tmpList_forThisCompany = [finalBigDataframe_1, finalBigDataframe_2, finalBigDataframe_3, finalBigDataframe_4]
    return (ii, tmpList_forThisCompany)

def collect_result(result):
    global results
    results.append(result)
import multiprocessing as mp

for elem_PeerGroup in list(sorted(finalDict))[:]:
    print(elem_PeerGroup)
    ### FOR ALL COMPANIES IN THIS PEER GROUP
    tmpDF_forPeerGroup = finalDict[elem_PeerGroup]
    if len(tmpDF_forPeerGroup) != 0:
        ## CREATE DATETIME RANGE
        start = datetime.strptime("01-01-2004", "%d-%m-%Y")
        end = datetime.strptime("06-04-2019", "%d-%m-%Y")
        date_generated = [start + timedelta(days=x) for x in range(0, (end - start).days)]
        # each process will use each CPU
        #pool = mp.Pool(mp.cpu_count())
        pool = mp.Pool(2)
        results = []
        for ii in range(0, len(tmpDF_forPeerGroup)):
            pool.apply_async(construct_each_company, args=(tmpDF_forPeerGroup, ii), callback=collect_result)
        pool.close()
        # postpones the execution of the next line of code until all processes in the queue are done
        pool.join()
        # Step 5: Sort results [OPTIONAL]
        results.sort(key=lambda x: x[0])
        finalListForCompanies = [r for (ii, r) in results]
    else:
        continue
    finalScores = []
    # for each dataframe, NORMALIZE the companies in the PEER GROUP
    for kk in range(4):
        tmpListForNormalisation = []
        for elem in finalListForCompanies:
            tmpListForNormalisation.append(elem[kk])
        dict_of_dfs = dict(enumerate(tmpListForNormalisation))
        try:
            dframes = pd.concat(dict_of_dfs)
        except:
            finalScores.append(None)
            continue
        dframes = dframes.iloc[:, 1:]
        if len(dframes) == 0:
            finalScores.append(None)
            continue
        if len(dframes) == len(dframes.groupby(level=1)):
            arrayTest = []
            for k in range(len(tmpListForNormalisation)):
                if (tmpListForNormalisation[k] is None) or (len(tmpListForNormalisation[k]) == 0):
                    arrayTest.append(None)
                else:
                    arrayTest.append(tmpListForNormalisation[k])
            # put the final result into a list
            dict_of_dfs2 = dict(enumerate(arrayTest))
            finalScores.append(dict_of_dfs2)
        else:
            test = dframes.groupby(level=1).pipe(lambda g: dframes.sub(g.mean(), level=1).div(g.std(), level=1))
            tmpListForNormalisation2 = []
            for date, new_df in test.groupby(level=0):
                tmpListForNormalisation2.append(new_df)
            arrayTest = []
            j = 0
            for k in range(len(tmpListForNormalisation)):
                if (tmpListForNormalisation[k] is None) or (len(tmpListForNormalisation[k]) == 0):
                    arrayTest.append(None)
                else:
                    arrayTest.append(tmpListForNormalisation2[j])
                    j += 1
            test_min = test.min(level=1)
            test_max = test.max(level=1)
            dict_of_dfs2 = dict(enumerate(arrayTest))

            def nrm(d):
                _d = d
                _d.index = _d.index.get_level_values(1)
                NewRange = np.array([0, 100])
                o = test_max - test_min
                n = NewRange[1] - NewRange[0]
                return (((_d - test_min) * n) / o) + NewRange[0]

            for k, d in dict_of_dfs2.items():
                if d is None:
                    continue
                d.loc[:] = nrm(d).rolling(window=7).mean()
            # put the final result into a list
            finalScores.append(dict_of_dfs2)
    # take the final MEAN for every company
    for ll in range(len(tmpDF_forPeerGroup)):
        namex = tmpDF_forPeerGroup['Name'].values[ll]
        print("Inserting to DB...", namex)
        company_idx = tmpDF_forPeerGroup['Company_idx'].values[ll]
        company_symbol = tmpDF_forPeerGroup['Symbol'].values[ll]
        industryName = tmpDF_forPeerGroup['GICS_Industry_Name'].values[ll]
        try:
            val1 = finalScores[0][ll]
        except:
            val1 = None
        try:
            val2 = finalScores[1][ll]
        except:
            val2 = None
        try:
            val3 = finalScores[2][ll]
        except:
            val3 = None
        try:
            val4 = finalScores[3][ll]
        except:
            val4 = None
        tmpList = [val1, val2, val3, val4]
        tmpDf = dict(enumerate(tmpList))
        dframes = pd.concat(tmpDf)
        finfin = dframes.mean(level=1)
        # adjust according to its industry weights
        finfin = adjustWeights(industryName, finfin)
        # take data from 01.01.2007 onwards only
        finfin = finfin['2007/01/01':]
        # NOW PUT TO DATABASE
        engine = create_engine("mysql://mydb.us-east-2.rds.amazonaws.com/" + newDatabaseName)
        con = engine.connect()
        finfin['timestamp'] = finfin.index
        finfin['company_idx'] = [company_idx] * len(finfin)
        finfin['company_symbol'] = [company_symbol] * len(finfin)
        finfin.to_sql(name='Scores', con=con, if_exists='append', index=False)
I don't see why my VM is only using 8% of its CPU in this case. I don't see an error in my code either, since it loops over many different companies and should allocate one CPU per company.
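An observation added here, not part of the original post: the pool is created with mp.Pool(2), so at most two worker processes ever run regardless of the instance size, and each worker likely spends much of its time waiting on MySQL. A minimal sketch of sizing the pool to the vCPU count and creating it once outside the loop over peer groups (names as in the question; this is an illustration, not a drop-in fix):

import multiprocessing as mp

if __name__ == '__main__':
    # date_generated and any other globals used by the workers must exist
    # before the pool forks its worker processes
    with mp.Pool(mp.cpu_count()) as pool:  # one worker per vCPU (8 on t2.2xlarge)
        for elem_PeerGroup in sorted(finalDict):
            tmpDF_forPeerGroup = finalDict[elem_PeerGroup]
            if len(tmpDF_forPeerGroup) == 0:
                continue
            results = pool.starmap(
                construct_each_company,
                [(tmpDF_forPeerGroup, ii) for ii in range(len(tmpDF_forPeerGroup))],
            )
            results.sort(key=lambda x: x[0])
            finalListForCompanies = [r for (_, r) in results]
            # ...normalisation and the DB inserts continue as in the question...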

Bokeh charts unresponsive on rangeslider on_change

I am working with Bokeh charts for the first time. I have followed a few tutorials, but for some reason the update function is not triggered by the RangeSlider's on_change().
import pandas as pd
from bokeh.io import show
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, Panel, RangeSlider, Tabs, WidgetBox
from bokeh.plotting import figure

# df, ids and dateTime() are defined elsewhere in my script

def make_data(df, start, end):
    #df['ID'] = range(1, len(df) + 1)
    s = df['ID'] >= start
    e = df['ID'] <= end
    df1 = df[e & s]
    date = df1['date'].tolist()
    capi = df1['capi'].tolist()
    data = {'x': dateTime(date), 'y': capi}
    source = ColumnDataSource(data)
    return source

def update(attr, old, new):
    df = pd.DataFrame.from_csv("main_data.csv", index_col=None)
    df['ID'] = range(1, len(df) + 1)
    new_src = make_dataset(df, range_start=range_select.value[0], range_end=range_select.value[1])
    source.data.update(new_src.data)

def make_plot(source):
    p1 = figure(x_axis_type="datetime", title="Stock Closing Prices")
    p1.grid.grid_line_alpha = 0.3
    p1.xaxis.axis_label = 'Date'
    p1.yaxis.axis_label = 'Price'
    p1.line('x', 'y', source=source, color='#A6CEE3', legend='capi')
    return p1

range_select = RangeSlider(title="Date range", value=(ids[0], ids[100]), start=ids[0], end=ids[-1], step=1)
range_select.on_change('value', update)
source = make_data(df, 1, 1000)
p = make_plot(source)
controls = WidgetBox(range_select)
layout = column(controls, p)
tab = Panel(child=layout, title='Histogram')
tabs = Tabs(tabs=[tab])
show(tabs)
Can someone please point me in the right direction here?
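Two details worth checking, offered as a guess rather than a confirmed diagnosis: update() calls make_dataset(...) with range_start/range_end keywords, while the function defined above is make_data(df, start, end); and callbacks registered with on_change() only execute when the document is served by a Bokeh server (bokeh serve), not in the static HTML produced by show(). A minimal sketch under those assumptions, with myapp.py as a placeholder filename:

# Sketch: call the function that actually exists and serve the document so the
# Python callback can run (start with: bokeh serve --show myapp.py)
from bokeh.io import curdoc

def update(attr, old, new):
    df = pd.read_csv("main_data.csv")  # read_csv replaces the deprecated DataFrame.from_csv
    df['ID'] = range(1, len(df) + 1)
    new_src = make_data(df, range_select.value[0], range_select.value[1])
    source.data.update(new_src.data)

range_select.on_change('value', update)
curdoc().add_root(column(WidgetBox(range_select), p))  # instead of show(tabs)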

How can I speed up my python program?

I was trying to make a program in Python 3.4.1 to obtain the prime numbers from 2 to 100,000.
My problem is that it takes too much time to process all the information and it never gives me any result.
I left it running for around half an hour; it slows down my whole computer and it doesn't give me what I want.
I am using the Sieve of Eratosthenes algorithm ("Criba de Eratostenes").
Here is my code:
from math import *

def primos(num):
    num2 = num + 1
    tnumeros = []   # tnumeros = every number from 2 to num
    npnumeros = []  # npnumeros = every number that is no prime
    pnumeros = []   # pnumeros = every prime number
    for a in range( 2, num2 ):
        tnumeros.append( a )
    for i in range( 2, int( sqrt( num ) ) + 1 ):
        for j in range( i, int( num / i ) + 1 ):
            np = i * j
            npnumeros.append( np )
    npnumeros = list( set( npnumeros ) )
    for e in tnumeros:
        if ( e in npnumeros ):
            continue
        else:
            pnumeros.append( e )
    return ( str( "".join( str( pnumeros ) ) ) )

print( primos( 100000 ) )
Don't use a list for your npnumeros value; use a set instead. You're only interested in looking up whether a number is in that collection, so make it a set from the start:
npnumeros = set()

# ...

for i in range( 2, int( sqrt( num ) ) + 1 ):
    for j in range( i, int( num / i ) + 1 ):
        np = i * j
        npnumeros.add( np )

# npnumeros = list( set( npnumeros ) ) # Remove this line, it's no longer needed

for e in tnumeros:
    if ( e in npnumeros ):
        continue
    else:
        pnumeros.append( e )
The reason your code is slow is that looking up numbers in a list is O(N) time, and doing that inside an O(N) loop is O(N^2) time. But looking up numbers in a set is O(1) time, so you'll have O(N) time inside that loop. Going from O(N^2) to O(N) is going to represent a HUGE difference in processing speed.
If you don't understand the O(N) notation I used, Google "Big O notation" to read more about it.
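To make the difference concrete, here is a small timing sketch (not from the original answer) comparing membership tests against a list and a set holding the same 100,000 numbers:

# Sketch: list membership scans every element; set membership is a hash lookup
import timeit

data_list = list(range(100000))
data_set = set(data_list)

print(timeit.timeit('99999 in data_list', globals=globals(), number=1000))  # linear scan each time
print(timeit.timeit('99999 in data_set', globals=globals(), number=1000))   # near-constant-time lookups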
This is a severely truncated answer, since this question should probably be moved to Code Review.
One quick speed-up is simply leaving npnumeros as a set instead of a list. That means the later check if ( e in npnumeros ): will run significantly faster.
The modified code:
from math import *

def primos(num):
    num2 = num + 1
    tnumeros = []   # tnumeros = every number from 2 to num
    npnumeros = []  # npnumeros = every number that is no prime
    pnumeros = []   # pnumeros = every prime number
    for a in range( 2, num2 ):
        tnumeros.append( a )
    for i in range( 2, int( sqrt( num ) ) + 1 ):
        for j in range( i, int( num / i ) + 1 ):
            np = i * j
            npnumeros.append( np )
    npnumeros = set( npnumeros )
    for e in tnumeros:
        if ( e in npnumeros ):
            continue
        else:
            pnumeros.append( e )
    return ( str( "".join( str( pnumeros ) ) ) )

print( primos( 100000 ) )
runs ~60 times faster.
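For reference (an addition, not part of either answer), the classic sieve is usually written with a boolean array instead of collecting composites in a separate container; a short sketch:

# Sketch: Sieve of Eratosthenes with a boolean array, O(n log log n)
def primes_upto(n):
    is_prime = [True] * (n + 1)
    is_prime[0] = is_prime[1] = False
    for i in range(2, int(n ** 0.5) + 1):
        if is_prime[i]:
            # mark every multiple of i from i*i upward as composite
            for multiple in range(i * i, n + 1, i):
                is_prime[multiple] = False
    return [i for i, flag in enumerate(is_prime) if flag]

print(len(primes_upto(100000)))  # 9592 primes up to 100,000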
