python flickrapi to download all albums - flickr

I am using flickrapi to download albums and have written this code:
import flickrapi

flickr = flickrapi.FlickrAPI(api_key, api_password, format='parsed-json')
userid = 'XXXXXXXXX'
# auth and token not required anymore, since already cached
#flickr.authenticate_via_browser(perms='read')
#token = flickr.get_request_token()
photos = flickr.photos.search(user_id=userid, per_page='100')
sets = flickr.photosets.getList(user_id=userid)
#print photos
#print sets
total_photos = 0
for photoset_index in sets['photosets']['photoset']:
    photoset_title = photoset_index['title']['_content']
    number_photos = photoset_index['photos']
    total_photos = total_photos + number_photos
    #print photoset_title, number_photos
total_number_of_albums = len(sets['photosets']['photoset'])
print total_number_of_albums
for photos_index in photos['photos']['photo']:
    photo_name = photos_index['title']
    photo_id = photos_index['id']
    #print photo_name, photo_id
total_number_of_pics = photos['photos']['total']
print total_photos, total_number_of_pics
I can get the name of the album and the number of photos, but how do I access the album pics? The documentation and developer API do not say anything about downloading: https://www.flickr.com/services/api/
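The photos inside an album can be listed with photosets.getPhotos; here is a minimal sketch using the same api_key, api_secret and userid as above (the url_o/url_l extras are an assumption about which size URLs the account exposes, and pagination is omitted):

import flickrapi
import requests

flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')
sets = flickr.photosets.getList(user_id=userid)
for photoset in sets['photosets']['photoset']:
    # photosets.getPhotos lists the photos in one album; url_o is only present
    # when the account allows original-size downloads, so fall back to url_l.
    album = flickr.photosets.getPhotos(photoset_id=photoset['id'],
                                       user_id=userid,
                                       extras='url_o,url_l')
    for photo in album['photoset']['photo']:
        url = photo.get('url_o') or photo.get('url_l')
        if url:
            with open(photo['id'] + '.jpg', 'wb') as out:
                out.write(requests.get(url).content)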

I found a workaround for the problem. You need
sudo pip install flickr_download
The documentation is https://pypi.python.org/pypi/flickr_download
Copy the output of the code below into a bash script and run it; all the albums will be downloaded.
Here is the code:
import flickrapi

api_key = 'GET_IT_FROM_UR_FLICKR_ACCOUNT'
api_secret = 'GET_IT_FROM_UR_FLICKR_ACCOUNT'
userid = 'XXXXXXXXX'

flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')
flickr.authenticate_via_browser(perms='read')
print flickr.token_cache.token
photos = flickr.photos.search(user_id=userid, per_page='100')
sets = flickr.photosets.getList(user_id=userid)
#print photos
#print sets
total_photos = 0
for photoset_index in sets['photosets']['photoset']:
    photoset_title = photoset_index['title']['_content']
    number_photos = photoset_index['photos']
    id = photoset_index['id']
    total_photos = total_photos + number_photos
    print "flickr_download -k " + api_key + " -s " + api_secret + " -t -d " + id
    #print id, photoset_title, number_photos
total_number_of_albums = len(sets['photosets']['photoset'])
print total_number_of_albums
for photos_index in photos['photos']['photo']:
    photo_name = photos_index['title']
    photo_id = photos_index['id']
    #print photo_name, photo_id
total_number_of_pics = photos['photos']['total']
# Cross-check photos in photosets against the actual number of photos. Should be equal.
print total_photos, total_number_of_pics
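For reference, the printed output is just one flickr_download command per album, along these lines (the photoset IDs here are invented):

flickr_download -k <api_key> -s <api_secret> -t -d 72157600000000001
flickr_download -k <api_key> -s <api_secret> -t -d 72157600000000002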

Related

LinkedIn web scraping snippet

I'm working on a university research project that involves web-scraped data. I started from an existing GitHub project, but that project does not retrieve all the data.
The project works like this:
Search Google using keywords, for example: (accountant 'email me at' Google)
Extract a snippet.
Retrieve data from this snippet.
The issue is:
The snippets extracted look like this: " ... marketing division in 2009. For more information on career opportunities with our company, email me: vicki@productivedentist.com. Neighborhood Smiles, LLC ..."
The snippet does not show everything; the "..." hides information such as role, location, and so on. How can I retrieve all the information with the script?
from googleapiclient.discovery import build #For using Google Custom Search Engine API
import datetime as dt #Importing system date for the naming of the output file.
import sys
from xlwt import Workbook #For working on xls file.
import re #For email search using regex.
if __name__ == '__main__':
    # Create an output file name in the format "srch_res_yyyyMMdd_hhmmss.xls" in the output folder
    now_sfx = dt.datetime.now().strftime('%Y%m%d_%H%M%S')
    output_dir = './output/'
    output_fname = output_dir + 'srch_res_' + now_sfx + '.xls'
    search_term = sys.argv[1]
    num_requests = int(sys.argv[2])
    my_api_key = "replace_with_your_api_key" #Read readme.md to learn how to get your API key.
    my_cse_id = "011658049436509675749:gkuaxghjf5u" #Google CSE which searches possible LinkedIn profiles according to the query.
    service = build("customsearch", "v1", developerKey=my_api_key)
    wb = Workbook()
    sheet1 = wb.add_sheet(search_term[0:15])
    wb.save(output_fname)
    sheet1.write(0,0,'Name')
    sheet1.write(0,1,'Profile Link')
    sheet1.write(0,2,'Snippet')
    sheet1.write(0,3,'Present Organisation')
    sheet1.write(0,4,'Location')
    sheet1.write(0,5,'Role')
    sheet1.write(0,6,'Email')
    sheet1.col(0).width = 256 * 20
    sheet1.col(1).width = 256 * 50
    sheet1.col(2).width = 256 * 100
    sheet1.col(3).width = 256 * 20
    sheet1.col(4).width = 256 * 20
    sheet1.col(5).width = 256 * 50
    sheet1.col(6).width = 256 * 50
    wb.save(output_fname)
    row = 1 #To insert the data in the next row.

    #Function to perform the Google search.
    def google_search(search_term, cse_id, start_val, **kwargs):
        res = service.cse().list(q=search_term, cx=cse_id, start=start_val, **kwargs).execute()
        return res

    for i in range(0, num_requests):
        # This is the offset from the beginning to start getting the results from
        start_val = 1 + (i * 10)
        # Make an HTTP request object
        results = google_search(search_term,
                                my_cse_id,
                                start_val,
                                num=10 #num can be 1 to 10; it sets the number of results per request.
                                )
        for profile in range(0, 10):
            snippet = results['items'][profile]['snippet']
            myList = [item for item in snippet.split('\n')]
            newSnippet = ' '.join(myList)
            contain = re.search(r'[\w\.-]+@[\w\.-]+', newSnippet)
            if contain is not None:
                title = results['items'][profile]['title']
                link = results['items'][profile]['link']
                org = "-NA-"
                location = "-NA-"
                role = "-NA-"
                if 'person' in results['items'][profile]['pagemap']:
                    if 'org' in results['items'][profile]['pagemap']['person'][0]:
                        org = results['items'][profile]['pagemap']['person'][0]['org']
                    if 'location' in results['items'][profile]['pagemap']['person'][0]:
                        location = results['items'][profile]['pagemap']['person'][0]['location']
                    if 'role' in results['items'][profile]['pagemap']['person'][0]:
                        role = results['items'][profile]['pagemap']['person'][0]['role']
                print(title[:-23])
                sheet1.write(row,0,title[:-23])
                sheet1.write(row,1,link)
                sheet1.write(row,2,newSnippet)
                sheet1.write(row,3,org)
                sheet1.write(row,4,location)
                sheet1.write(row,5,role)
                sheet1.write(row,6,contain[0])
                print('Wrote {} search result(s)...'.format(row))
                wb.save(output_fname)
                row = row + 1
    print('Output file "{}" written.'.format(output_fname))
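On the question of recovering what the "..." hides: nothing in the post answers it, but one hedged option is to look beyond items[n]['snippet'] in the same CSE response, since the pagemap often carries a longer og:description metatag. The helper name below is mine and the metatag keys are an assumption about what Google returns for a given result:

def fuller_text(item):
    # Prefer the page's og:description (when Google indexed it) over the truncated snippet.
    pagemap = item.get('pagemap', {})
    metatags = pagemap.get('metatags', [{}])
    og_desc = metatags[0].get('og:description')
    return og_desc if og_desc else item.get('snippet', '')

# usage inside the existing loop:
# newSnippet = fuller_text(results['items'][profile])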

Parsing Attached .MSG Files with Python3

I'm trying to monitor a phishing inbox that could receive both normal emails (i.e. HTML/text-based, with potential attachments) and emails that have a .MSG file attached.
The goal is to have users send emails to phishing@company.com; once I parse out the various links (potentially malicious) as well as attachments (also potentially malicious), I'll perform some analysis on them.
The issue I'm running into is the body of the .msg file that is attached.
With the code below, I'm able to pull the to, from, subject, and all links within the original email. It also pulls down any attachments within the .msg file (i.e. in my test I was able to pull down a PDF within the .msg). However, I cannot get any of the to, from, subject, or body of the .msg file itself.
When I print it out raw I get some of it in a very ugly format, but with the multiple parts I'm apparently doing something wrong when extracting that piece of information.
I'm fairly new to Python so any help would be greatly appreciated.
import imaplib
import base64
import os
import email
from bs4 import BeautifulSoup

server = 'mail.server.com'
email_user = 'phishing@company.com'
email_pass = 'XXXXXXXXXXXX'
output_dir = '/tmp/attachments/'
body = ""

def get_body(msg):
    if msg.is_multipart():
        return get_body(msg.get_payload(0))
    else:
        return msg.get_payload(None, True)

def get_attachments(msg):
    for part in msg.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue
        fileName = part.get_filename()
        if bool(fileName):
            filePath = os.path.join(output_dir, fileName)
            with open(filePath, 'wb') as f:
                f.write(part.get_payload(decode=True))

mail = imaplib.IMAP4_SSL(server)
mail.login(email_user, email_pass)
mail.select('INBOX')
result, data = mail.search(None, 'UNSEEN')
mail_ids = data[0]
id_list = mail_ids.split()
print(id_list)
for emailid in id_list:
    result, email_data = mail.fetch(emailid, '(RFC822)')
    raw_email = email_data[0][1]
    raw_email_string = raw_email.decode('utf-8')
    email_message = email.message_from_string(raw_email_string)
    email_from = str(email.header.make_header(email.header.decode_header(email_message['From'])))
    email_to = str(email.header.make_header(email.header.decode_header(email_message['To'])))
    subject = str(email.header.make_header(email.header.decode_header(email_message['Subject'])))
    print('From: ' + email_from)
    print('To: ' + email_to)
    print('Subject: ' + subject)
    get_attachments(email_message)  # walk() needs the parsed Message, not the raw bytes
    for part in email_message.walk():
        body = part.get_payload(0)
        content = body.get_payload(decode=True)
        soup = BeautifulSoup(content, 'html.parser')
        for link in soup.find_all('a'):
            print('Link: ' + link.get('href'))
        break
I got this working with the following code. I basically had to do multiple for loops within the .msg walk and then only pull out the relevant information within the text/html sections.
import re  # needed for the header/body regexes below

for emailid in id_list:
    result, data = mail.fetch(emailid, '(RFC822)')
    raw = email.message_from_bytes(data[0][1])
    get_attachments(raw)
    #print(raw)
    header_from = mail.fetch(emailid, "(BODY[HEADER.FIELDS (FROM)])")
    header_from_str = str(header_from)
    mail_from = re.search(r'From:\s.+<(\S+)>', header_from_str)
    header_subject = mail.fetch(emailid, "(BODY[HEADER.FIELDS (SUBJECT)])")
    header_subject_str = str(header_subject)
    mail_subject = re.search(r'Subject:\s(.+)\'\)', header_subject_str)
    #mail_body = mail.fetch(emailid, "(BODY[TEXT])")
    print(mail_from.group(1))
    print(mail_subject.group(1))
    for part in raw.walk():
        if part.get_content_type() == 'message/rfc822':
            part_string = str(part)
            original_from = re.search(r'From:\s.+<(\S+)>\n', part_string)
            original_to = re.search(r'To:\s.+<(\S+)>\n', part_string)
            original_subject = re.search(r'Subject:\s(.+)\n', part_string)
            print(original_from.group(1))
            print(original_to.group(1))
            print(original_subject.group(1))
        if part.get_content_type() == 'text/html':
            content = part.get_payload(decode=True)
            #print(content)
            soup = BeautifulSoup(content, 'html.parser')
            for link in soup.find_all('a'):
                print('Link: ' + link.get('href'))
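As a side note, a hedged alternative to running regexes over str(part): for a message/rfc822 part, get_payload() returns the embedded Message object itself, so its headers can be read directly. A sketch using the same raw variable as above (not part of the original answer):

for part in raw.walk():
    if part.get_content_type() == 'message/rfc822':
        attached = part.get_payload(0)  # the embedded email.message.Message
        print(attached['From'])
        print(attached['To'])
        print(attached['Subject'])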

Getting the number of old issues and the table (login and number of commits) of the most active members of the repository

I cannot get the above information using the GitHub API. Reading the documentation did not help much, and I still don't fully understand how to work with dates. Here is an example of my code for getting open issues:
import requests
import json
from datetime import datetime

username = '\'
password = '\'
another_page = True
opened = 0
closed = 0
api_oldest = 'https://api.github.com/repos/grpc/grpc/issues?per_page=5&q=sort=created:>`date -v-14d "+%Y-%m-%d"`&order=asc'
api_issue = 'https://api.github.com/repos/grpc/grpc/issues?page=1&per_page=5000'
api_pulls = 'https://api.github.com/repos/grpc/grpc/pulls?page=1'
datetime.now()
while another_page:
    r = requests.get(api_issue, auth=(username, password))
    #json_response = json.loads(r.text)
    #results.append(json_response)
    if 'next' in r.links:
        api_issue = r.links['next']['url']
        if item['state'] == 'open':
            opened += 1
        else:
            closed += 1
    else:
        another_page = False
datetime.now()
print(opened)
There are a few issues with your code. For example, what does item represent? Your code can be modified as follows to iterate over the response and count the open issues.
import requests

username = '/'
password = '/'
another_page = True
opened = 0
closed = 0
api_issue = "https://api.github.com/repos/grpc/grpc/issues?page=1&per_page=5000"
while another_page:
    r = requests.get(api_issue, auth=(username, password))
    json_response = r.json()
    #results.append(json_response)
    for item in json_response:
        if item['state'] == 'open':
            opened += 1
        else:
            closed += 1
    if 'next' in r.links:
        api_issue = r.links['next']['url']
    else:
        another_page = False
print(opened)
If you want issues that were created in the last 14 days, you could make the API request using a URL like the one below. Note, though, that the backticks are shell syntax and will not be evaluated inside a Python string, so the date has to be built in Python (see the sketch after it).
api_oldest = "https://api.github.com/repos/grpc/grpc/issues?q=sort=created:>`date -d '14 days ago'`&order=asc"
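One caveat worth flagging (not in the original answer): the repos/.../issues endpoint does not accept a q parameter, so the filter above may simply be ignored. A hedged alternative is the issue search endpoint, which does understand created: qualifiers; this sketch computes the cutoff in Python and reuses the username/password from above:

import requests
from datetime import datetime, timedelta

cutoff = (datetime.now() - timedelta(days=14)).strftime('%Y-%m-%d')
search_url = ('https://api.github.com/search/issues'
              '?q=repo:grpc/grpc+type:issue+state:open+created:>{}'.format(cutoff))
r = requests.get(search_url, auth=(username, password))
print(r.json()['total_count'])  # open issues created in the last 14 days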

Proper placement of test within try/except loop (Python3)

I use a Python script to web-scrape for "Show Notes" and an mp3. When I encounter a page that has no show notes, it means the show was a Best Of, so I want to skip downloading the notes and mp3. I am not sure where the best place to insert the test would be. The snippet is as follows:
for show_html in showpage_htmls:
    try:
        p_html = s.get(show_html)
        p_soup = BeautifulSoup(p_html.content, 'html.parser')
        # set title for SHOW NOTES
        title = ''
        title = p_soup.title.contents[0]
        # get SHOW NOTES chunk and remove unwanted characters (original mp3notes not changed)
        mp3notes = ''
        mp3notes = p_soup.find('div', {'class': 'module-text'}).find('div')
        mp3notes = str(title) + str('\n') + str(mp3notes).replace('<div>','').replace('<h2>','').replace('</h2>','\n').replace('<p>','').replace('<br/>\n','\n').replace('<br/>','\n').replace('</p>','').replace('</div>','').replace('\u2032','')
        # FIXME need to skip d/l if no notes
        # set basename, mp3named and mp3showtxt
        mp3basename = '{0}{1}{2}'.format(show_html.split('/')[3], show_html.split('/')[4], show_html.split('/')[5])
        if (os.name == 'nt'):
            mp3showtxt = mp3dir + '\\' + mp3basename + '.txt'
            mp3named = mp3dir + '\\' + mp3basename + '.mp3'
        else:
            mp3showtxt = mp3dir + '/' + mp3basename + '.txt'
            mp3named = mp3dir + '/' + mp3basename + '.mp3'
        # save show notes to local
        with open(mp3showtxt, 'w') as f:
            try:
                f.write(mp3notes)
                print("Show notes " + mp3basename + " saved.")
            except UnicodeEncodeError:
                print("A charmap encoding ERROR occurred.")
                print("Show notes for " + mp3basename + ".mp3 FAILED, but continuing")
            finally:
                f.close()
        # FIXME need eyed3 to set mp3 tags since B&T are lazy
        # get Full Show mp3 link
        mp3url = p_soup.find('a', href = True, string = 'Full Show').get('href')
        # get and save mp3
        r = requests.get(mp3url)
        with open(mp3named, 'wb') as f:
            f.write(r.content)
        print("Downloaded " + mp3basename + ".mp3.")
    except AttributeError:
        print(show_html + " did not exist as named.")
I would think an
if not (len(mp3notes) >= 50)
check would work; I'm just not sure where to put it, or whether there is a better (more Pythonic) way.
Ideally, if the mp3notes are less than expected, no notes or mp3 for that show_html would be saved, and the script would start at the next show_html page.
Since I am new to Python, feel free to offer suggestions to making this more Pythonic as well; I am here to learn! Thanks.
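Since the post does not settle on a placement, here is one hedged option (the 50-character threshold is the question's own guess): do the check right after mp3notes is assembled and continue to the next show_html, so neither the notes nor the mp3 are saved for that show.

# ...inside the try block, right after mp3notes has been cleaned up...
if len(mp3notes) < 50:  # threshold taken from the question; adjust as needed
    print(show_html + " looks like a Best Of (no show notes); skipping.")
    continue  # jump to the next show_html; nothing is written for this show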

Unload multiple files from Redshift to S3

Hi, I am trying to unload multiple tables from Redshift to a particular S3 bucket and am getting the error below:
psycopg2.InternalError: Specified unload destination on S3 is not empty. Consider using a different bucket / prefix, manually removing the target files in S3, or using the ALLOWOVERWRITE option.
If I add the 'allowoverwrite' option to the unload function, it overwrites the previously unloaded tables, so only the last table ends up in S3.
This is the code I have given:
import psycopg2

def unload_data(r_conn, aws_iam_role, datastoring_path, region, table_name):
    unload = '''unload ('select * from {}')
                to '{}'
                credentials 'aws_iam_role={}'
                manifest
                gzip
                delimiter ',' addquotes escape parallel off '''.format(table_name, datastoring_path, aws_iam_role)
    print("Exporting table to datastoring_path")
    cur = r_conn.cursor()
    cur.execute(unload)
    r_conn.commit()

def main():
    host_rs = 'dataingestion.*********.us******2.redshift.amazonaws.com'
    port_rs = '5439'
    database_rs = '******'
    user_rs = '******'
    password_rs = '********'
    rs_tables = ['Employee', 'Employe_details']
    iam_role = 'arn:aws:iam::************:role/RedshiftCopyUnload'
    s3_datastoring_path = 's3://mysamplebuck/'
    s3_region = 'us_*****_2'
    print("Exporting from source")
    src_conn = psycopg2.connect(host = host_rs,
                                port = port_rs,
                                database = database_rs,
                                user = user_rs,
                                password = password_rs)
    print("Connected to RS")
    for i, tabe in enumerate(rs_tables):
        if tabe[0] == tabe[-1]:
            print("No files to read!")
        unload_data(src_conn, aws_iam_role = iam_role, datastoring_path = s3_datastoring_path, region = s3_region, table_name = rs_tables[i])
        print(rs_tables[i])

if __name__ == "__main__":
    main()
It is complaining that you are saving the data to the same destination.
This would be like copying all the files on your computer to the same directory -- there will be files overwritten.
You should change your datastoring_path to be different for each table, such as:
.format(table_name, datastoring_path + '/' + table_name, aws_iam_role)
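For completeness, a minimal sketch of unload_data with that change applied (same parameters and UNLOAD options as in the question; the rstrip('/') is only there to avoid a double slash, since the example path ends in '/'):

def unload_data(r_conn, aws_iam_role, datastoring_path, region, table_name):
    # Give every table its own prefix so the UNLOADs no longer collide in S3.
    target = datastoring_path.rstrip('/') + '/' + table_name
    unload = '''unload ('select * from {}')
                to '{}'
                credentials 'aws_iam_role={}'
                manifest
                gzip
                delimiter ',' addquotes escape parallel off '''.format(table_name, target, aws_iam_role)
    cur = r_conn.cursor()
    cur.execute(unload)
    r_conn.commit()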
