Scrape Gmail for selected content and get attachments - python-3.x

I have a problem scraping Gmail.
The task is to extract information about financial transactions from a Gmail account: invoices, subscription alerts, upcoming bills, and so on. I need to connect to the account, scan the emails for words like "upcoming invoice" or "subscription", and pull out the amount, the date, and the attachment if there is one.
I have to collect this information and also store all the attachments. Is there a simple way to do it?
My code:
import imaplib
import os
import email, getpass
import sys
import json

class GmailFinin():
    def helloWorld(self):
        print("\nHello I'm here to help you")

    def initializeVariables(self):
        self.usr = ""
        self.pwd = ""
        self.mail = object
        self.mailbox = ""
        self.mailCount = 0
        self.destFolder = ""
        self.data = []
        self.ids = []
        self.idsList = []

    def getLogin(self):
        print("\nPlease enter your Gmail login details below.")
        self.usr = input("Email: ")
        # self.pwd = input("Password: ")
        self.pwd = getpass.getpass("Enter your password --> ")

    def attemptLogin(self):
        self.mail = imaplib.IMAP4_SSL("imap.gmail.com", 993)
        if self.mail.login(self.usr, self.pwd):
            print("\nLogon SUCCESSFUL")
            self.destFolder = input("\nPlease choose a destination folder in the form of /Users/username/dest/ (do not forget trailing slash!): ")
            if not self.destFolder.endswith("/"): self.destFolder+="/"
            return True
        else:
            print("\nLogon FAILED")
            return False

    def checkIfUsersWantsToContinue(self):
        print("\nWe have found "+str(self.mailCount)+" emails in the mailbox "+self.mailbox+".")
        return True if input("Do you wish to continue extracting all the emails into "+self.destFolder+"? (y/N) ").lower().strip()[:1] == "y" else False

    def selectMailbox(self):
        # self.mailbox = input("\nPlease type the name of the mailbox you want to extract, e.g. Inbox: ")
        self.mailbox = "Inbox"
        bin_count = self.mail.select(self.mailbox)[1]
        self.mailCount = int(bin_count[0].decode("utf-8"))
        return True if self.mailCount > 0 else False

    def searchThroughMailbox(self):
        type, self.data = self.mail.search(None, "ALL")
        self.ids = self.data[0]
        self.idsList = self.ids.split()

    def parseEmails(self):
        jsonOutput = {}
        for anEmail in self.data[0].split():
            type, self.data = self.mail.fetch(anEmail, '(UID RFC822)')
            raw = self.data[0][1]
            try:
                raw_str = raw.decode("utf-8")
            except UnicodeDecodeError:
                try:
                    raw_str = raw.decode("ISO-8859-1") # ANSI support
                except UnicodeDecodeError:
                    try:
                        raw_str = raw.decode("ascii") # ASCII ?
                    except UnicodeDecodeError:
                        pass
            msg = email.message_from_string(raw_str)
            jsonOutput['subject'] = msg['subject']
            jsonOutput['from'] = msg['from']
            jsonOutput['date'] = msg['date']
            raw = self.data[0][0]
            raw_str = raw.decode("utf-8")
            uid = raw_str.split()[2]
            # Body #
            if msg.is_multipart():
                for part in msg.walk():
                    partType = part.get_content_type()
                    ## Get Body ##
                    if partType == "text/plain" and "attachment" not in part:
                        jsonOutput['body'] = part.get_payload()
                    ## Get Attachments ##
                    if part.get('Content-Disposition') is not None:
                        attchName = part.get_filename()
                        print(attchName)
                        if bool(attchName):
                            attchFilePath = str(self.destFolder)+str(uid)+str("/")+str(attchName)
                            print(attchFilePath)
                            os.makedirs(os.path.dirname(attchFilePath), exist_ok=True)
                            with open(attchFilePath, "wb") as f:
                                f.write(part.get_payload(decode=True))
            else:
                # jsonOutput['body'] = msg.get_payload(decode=True).decode("utf-8") # Non-multipart email, perhaps no attachments or just text.
                jsonOutput['body'] = msg.get_payload()
            outputDump = json.dumps(jsonOutput)
            emailInfoFilePath = str(self.destFolder)+str(uid)+str("/")+str(uid)+str(".json")
            os.makedirs(os.path.dirname(emailInfoFilePath), exist_ok=True)
            print(emailInfoFilePath)
            with open(emailInfoFilePath, "w") as f:
                f.write(outputDump)

    def __init__(self):
        self.initializeVariables()
        self.helloWorld()
        self.getLogin()
        if self.attemptLogin():
            not self.selectMailbox() and sys.exit()
        else:
            sys.exit()
        not self.checkIfUsersWantsToContinue() and sys.exit()
        self.searchThroughMailbox()
        self.parseEmails()

if __name__ == "__main__":
    run = GmailFinin()
I have tried using the search below, but I don't think it is optimal, because it seems to search only the subject. And how do I add multiple OR conditions for a list of keywords?
type, self.data = self.mail.search(None, '(OR TEXT "bill" SUBJECT "bill")')
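For what it's worth, the IMAP TEXT criterion matches the whole message (headers and body), not just the subject, so the SUBJECT clause above is largely redundant. To OR together a list of keywords, note that IMAP's OR takes exactly two criteria, so longer lists have to be nested. A minimal sketch (build_or_query is my own helper name, not part of imaplib):

def build_or_query(keywords):
    # IMAP OR is binary, so chain it: (OR (OR TEXT "a" TEXT "b") TEXT "c")
    query = 'TEXT "%s"' % keywords[0]
    for kw in keywords[1:]:
        query = '(OR %s TEXT "%s")' % (query, kw)
    return query

typ, self.data = self.mail.search(None, build_or_query(["bill", "invoice", "subscription"]))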

Related

Is it possible to change the output so that "Arlene" and "Klusman" don't have an extra set of parentheses around them?

I'm writing code for an assignment where I can't change main. The way I have it written, it prints as shown in the screenshot below:
Here is my code:
import csv

class Customer:
    def __init__(self, cust_id, name, lastName, companyName, address, city, state, cust_zip):
        self.cust_id = cust_id
        self.first_name = name
        self.last_name = lastName
        self.company_name = companyName
        self.address = address
        self.city = city
        self.state = state
        self.zip = cust_zip

    def getFullName(self):
        return(self.first_name, self.last_name)

    def getFullAddress(self):
        return(self.getFullName(), self.company_name, self.address, self.city, self.state, self.zip)

def get_customers():
    myList = []
    counter = 0
    with open("customers.csv", "r") as csv_file:
        reader = csv.reader(csv_file, delimiter = ",")
        for row in reader:
            if counter!=0:
                customer1 = Customer(row[0],row[1],row[2],row[3],row[4],row[5],row[6],row[7])
                myList.append(customer1)
            counter+=1
    return myList

def find_customer_by_id(customers, cust_id):
    for i in range(len(customers)):
        if cust_id == customers[i].cust_id:
            return customers[i]
    return None

def main():
    #main is fully implemented with no modification expected
    print("Customer Viewer")
    print()
    customers = get_customers()
    while True:
        cust_id = input("Enter customer ID: ").strip()
        print()
        customer = find_customer_by_id(customers, cust_id)
        if customer == None:
            print("No customer with that ID.")
            print()
        else:
            print(customer.getFullAddress())
            print()
        again = input("Continue? (y/n): ").lower()
        print()
        if again != "y":
            break
    print("Bye!")

if __name__ == "__main__":
    main()
Why are there parentheses, and can you get rid of them?
I tried two different approaches, but nothing changed the output in the intended way.
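The parentheses come from tuples: getFullName returns the tuple (self.first_name, self.last_name), getFullAddress then wraps that tuple inside another tuple, and main prints the result directly, which renders each tuple with parentheses. Since main cannot be changed, one fix (a minimal sketch, not the only option) is to have both methods return formatted strings instead:

    def getFullName(self):
        # one string instead of a tuple, so no inner parentheses
        return self.first_name + " " + self.last_name

    def getFullAddress(self):
        return ", ".join([self.getFullName(), self.company_name, self.address, self.city, self.state, self.zip])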

Why is the body part of the email not found (Python imap)?

I found an article about reading email with Python. The sender of the letter and the subject are found, but there is a problem with the body of the mail: it is not found. Maybe there are other ways to read the body?
import imaplib
import email
from email.header import decode_header
import webbrowser
import os

# account credentials
username = "username@mail.ru"
password = "password"

def clean(text):
    return "".join(c if c.isalnum() else "_" for c in text)

imap = imaplib.IMAP4_SSL("imap.mail.ru")
imap.login(username, password)
# it is spam folder id
status, messages = imap.select("&BCEEPwQwBDw-")
messages = int(messages[0])

for i in range(messages, 0, -1):
    res, msg = imap.fetch(str(i), "(RFC822)")
    for response in msg:
        if isinstance(response, tuple):
            msg = email.message_from_bytes(response[1])
            subject, encoding = decode_header(msg["Subject"])[0]
            if isinstance(subject, bytes):
                subject = subject.decode(encoding)
            From, encoding = decode_header(msg.get("From"))[0]
            if isinstance(From, bytes):
                From = From.decode(encoding)
            print("Subject:", subject)
            print("From:", From)
            if msg.is_multipart():
                for part in msg.walk():
                    # extract content type of email
                    content_type = part.get_content_type()
                    content_disposition = str(part.get("Content-Disposition"))
                    try:
                        body = part.get_payload(decode=True).decode()
                    except:
                        pass
                    if content_type == "text/plain" and "attachment" not in content_disposition:
                        print(body)
                    elif "attachment" in content_disposition:
                        filename = part.get_filename()
                        if filename:
                            folder_name = clean(subject)
                            if not os.path.isdir(folder_name):
                                os.mkdir(folder_name)
                            filepath = os.path.join(folder_name, filename)
                            open(filepath, "wb").write(part.get_payload(decode=True))
            else:
                content_type = msg.get_content_type()
                body = msg.get_payload(decode=True).decode()
                if content_type == "text/plain":
                    print(body)
            if content_type == "text/html":
                folder_name = clean(subject)
                if not os.path.isdir(folder_name):
                    os.mkdir(folder_name)
                filename = "index.html"
                filepath = os.path.join(folder_name, filename)
                open(filepath, "w").write(body)
                webbrowser.open(filepath)

imap.close()
imap.logout()
For some reason the body is not located, and because of this an error appears:
NameError: name 'body' is not defined
body is not defined because of this:
try:
    body = part.get_payload(decode=True).decode()
except:
    pass
You tried to define body, but failed due to some kind of error that you allowed to pass silently (hint: don't do that!). Then the next blocks of code assumed that body had already been assigned when it hadn't.
Solution: define body outside of the try/except clause:
body = None
try:
    body = part.get_payload(decode=True).decode()
except:
    # pass
    import traceback
    traceback.print_exc()
    # this way you'll at least know what the error was
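One follow-up: with this change, body may still be None when the decode failed or the part had no decodable payload, so the later uses should guard on it, e.g. (a sketch against the loop shown above):

if body and content_type == "text/plain" and "attachment" not in content_disposition:
    print(body)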

How do I make the script send one notification when it shows that an item is in stock?

As is, the code checks whether the links in the .txt file are in or out of stock. If a product is in stock, a notification is sent to the recipient(s) every x minutes. How do I make the script send only one notification for an in-stock item? Then, when it scrapes Walmart again in the next five minutes, how do I skip that product's URL in the list? And in general, how do I improve the script?
#!/usr/bin/env python3
import smtplib
import sys
import requests
import schedule
import time
import datetime
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from bs4 import BeautifulSoup
from termcolor import cprint, colored
from colorama import init
#------------------------------------------------ All Modules required for script are above

with open('url.txt', 'r') as f: #Opens url.txt file and reads line by line
    urls = [line.strip() for line in f]

def main():
    for u in urls:
        response = requests.get(u)
        data = response.text
        soup = BeautifulSoup(data,'lxml')
        out_stock = soup.find("span", class_="display-block-xs font-bold")
        in_stock = not out_stock
        title = colored(soup.find(class_="prod-ProductTitle prod-productTitle-buyBox font-bold").get_text(u), "white", attrs=['bold']) # Finds the title of item
        product_message = "Item in stock"
        cprint("----------------------------", "blue")
        cprint("Product Name: " + title, "white", attrs=['reverse']) # Gives you the title name
        if(in_stock): #Checks if item is in stock
            cprint(product_message, "green", attrs=['reverse'])
        elif(out_stock): #Checks if item is out of stock
            cprint(out_stock.get_text(), "red", attrs=['reverse'])
        else:
            print("ERROR")

def sendemail(): #Email and Text message alert notification
    for u in urls:
        response = requests.get(u)
        data = response.text
        soup = BeautifulSoup(data,'lxml')
        #Login here and connects to the server
        email_sender_account = ""
        email_sender_username = ""
        email_sender_password = ""
        email_smtp_server = "smtp.gmail.com"
        email_smtp_port = 587
        #Sender
        email_recipients = ["", ""] # Add email or number here in format: "",
        email_subject = "Phone notification alert"
        email_body = "Product is in stock: " + u
        server = smtplib.SMTP(email_smtp_server,email_smtp_port)
        server.starttls()
        server.login(email_sender_username, email_sender_password)
        for recipient in email_recipients:
            message = MIMEMultipart('alternative')
            message['From'] = email_sender_account
            message['To'] = recipient
            message['Subject'] = email_subject
            message.attach(MIMEText(email_body))
            text = message.as_string()
            server.sendmail(email_sender_account,recipient,text)
        server.quit()

def restart():
    current_time = datetime.datetime.now().strftime("%A, %B %d, %H:%M")
    print("\n" * 100)
    print()
    cprint("------------------------------", "blue")
    cprint("Date: " + current_time, "white", attrs=['bold'])
    main()

schedule.every(5).minutes.do(restart)

while 1:
    schedule.run_pending()
    time.sleep(1)
    main()
To alert the user only once when an item comes in stock, store the in-stock items in a list.
When an item is in stock, check the in-stock list:
if the URL is not in the list, add it and send the alert;
else skip the alert.
Here is the updated main():
lstInStock = [] # items previously found in stock

def main():
    for u in urls:
        ..........
        if(in_stock): #Checks if item is in stock
            cprint(product_message, "green", attrs=['reverse'])
            if not u in lstInStock: # first time in stock
                lstInStock.append(u) # skip next time
                sendemail() #send alert
        elif(out_stock): #Checks if item is out of stock
            cprint(out_stock.get_text(), "red", attrs=['reverse'])
        else:
            print("ERROR")
Note that this will send alerts for every in-stock item on the first run since lstInStock is empty.
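If the script is restarted, lstInStock is also lost and every in-stock item alerts again. One way around that is to persist the list to disk; a minimal sketch (the instock.json filename is arbitrary):

import json, os

STATE_FILE = "instock.json"

def load_instock():
    # returns the previously alerted URLs, or an empty list on first run
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE) as f:
            return json.load(f)
    return []

def save_instock(lst):
    with open(STATE_FILE, "w") as f:
        json.dump(lst, f)

Initialize with lstInStock = load_instock() at startup and call save_instock(lstInStock) right after each append.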

Adding Lyrics to the search results for the billboard.py python package

guoguo12 has written a very handy billboard.com parser. However, I would like to add a feature that also returns lyrics, in addition to all of the other very helpful information the package provides. However, I am a noob and have no idea what I am doing.
I believe this is the piece of Billboard's HTML that would give access to the lyrics:
<a class="chart-element__information__lyrics show-more__hidden-el" title="Read Lyrics of Blinding Lights by The Weeknd" href="https://www.billboard.com/articles/news/lyrics/8545919/the-weeknd-blinding-lights-lyrics" target="_blank">Song Lyrics</a>
Here is the Python code for billboard.py:
#!/usr/bin/env python
import datetime
import json
import re
import sys

from bs4 import BeautifulSoup
import requests

"""billboard.py: Unofficial Python API for accessing music charts from Billboard.com."""

__author__ = "Allen Guo"
__license__ = "MIT"
__maintainer__ = "Allen Guo"
__email__ = "guoguo12@gmail.com"


# css selector constants
_CHART_NAME_SELECTOR = 'meta[name="title"]'
_DATE_ELEMENT_SELECTOR = "button.chart-detail-header__date-selector-button"
_PREVIOUS_DATE_SELECTOR = "span.fa-chevron-left"
_NEXT_DATE_SELECTOR = "span.fa-chevron-right"
_ENTRY_LIST_SELECTOR = "div.chart-list-item"
_ENTRY_TITLE_ATTR = "data-title"
_ENTRY_ARTIST_ATTR = "data-artist"
_ENTRY_IMAGE_SELECTOR = "img.chart-list-item__image"
_ENTRY_RANK_ATTR = "data-rank"

# constants for the getMinistatsCellValue helper function
_MINISTATS_CELL = "div.chart-list-item__ministats-cell"
_MINISTATS_CELL_HEADING = "span.chart-list-item__ministats-cell-heading"


class BillboardNotFoundException(Exception):
    pass


class BillboardParseException(Exception):
    pass


class ChartEntry:
    """Represents an entry (typically a single track) on a chart.

    Attributes:
        title: The title of the track.
        artist: The name of the track artist, as formatted on Billboard.com.
            If there are multiple artists and/or featured artists, they will
            be included in this string.
        image: The URL of the image for the track.
        peakPos: The track's peak position on the chart as of the chart date,
            as an int (or None if the chart does not include this information).
        lastPos: The track's position on the previous week's chart, as an int
            (or None if the chart does not include this information).
            This value is 0 if the track was not on the previous week's chart.
        weeks: The number of weeks the track has been or was on the chart,
            including future dates (up until the present time).
        rank: The track's position on the chart, as an int.
        isNew: Whether the track is new to the chart, as a boolean.
    """

    def __init__(self, title, artist, image, peakPos, lastPos, weeks, rank, isNew):
        self.title = title
        self.artist = artist
        self.image = image
        self.peakPos = peakPos
        self.lastPos = lastPos
        self.weeks = weeks
        self.rank = rank
        self.isNew = isNew

    def __repr__(self):
        return "{}.{}(title={!r}, artist={!r})".format(
            self.__class__.__module__, self.__class__.__name__, self.title, self.artist
        )

    def __str__(self):
        """Returns a string of the form 'TITLE by ARTIST'."""
        if self.title:
            s = u"'%s' by %s" % (self.title, self.artist)
        else:
            s = u"%s" % self.artist
        if sys.version_info.major < 3:
            return s.encode(getattr(sys.stdout, "encoding", "") or "utf8")
        else:
            return s

    def json(self):
        """Returns the entry as a JSON string.
        This is useful for caching.
        """
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)


class ChartData:
    """Represents a particular Billboard chart for a particular date.

    Attributes:
        name: The chart name, as a string.
        title: The human-readable chart name, as a string.
        date: The date of the chart.
        previousDate: The date of the previous chart, as a string in YYYY-MM-DD
            format, or None if this information was not available.
        entries: A list of ChartEntry objects, ordered by position on the chart
            (highest first).
    """

    def __init__(self, name, date=None, fetch=True, max_retries=5, timeout=25):
        """Constructs a new ChartData instance.

        Args:
            name: The chart name, e.g. 'hot-100' or 'pop-songs'.
            date: The chart date, as a string in YYYY-MM-DD format.
                By default, the latest chart is fetched.
                If the argument is not a date on which a chart was published,
                Billboard automatically rounds dates up to the nearest date on
                which a chart was published.
                If this argument is invalid, no exception will be raised;
                instead, the chart will contain no entries.
            fetch: A boolean indicating whether to fetch the chart data from
                Billboard.com immediately (at instantiation time).
                If False, the chart data can be populated at a later time
                using the fetchEntries() method.
            max_retries: The max number of times to retry when requesting data
                (default: 5).
            timeout: The number of seconds to wait for a server response.
                If None, no timeout is applied.
        """
        self.name = name
        if date is not None:
            if not re.match("\d{4}-\d{2}-\d{2}", str(date)):
                raise ValueError("Date argument is not in YYYY-MM-DD format")
            try:
                datetime.datetime(*(int(x) for x in str(date).split("-")))
            except:
                raise ValueError("Date argument is invalid")
        self.date = date
        self.title = ""
        self.previousDate = None
        self._max_retries = max_retries
        self._timeout = timeout
        self.entries = []
        if fetch:
            self.fetchEntries()

    def __repr__(self):
        return "{}.{}({!r}, date={!r})".format(
            self.__class__.__module__, self.__class__.__name__, self.name, self.date
        )

    def __str__(self):
        """Returns the chart as a human-readable string (typically multi-line)."""
        if not self.date:
            s = "%s chart (current)" % self.name
        else:
            s = "%s chart from %s" % (self.name, self.date)
        s += "\n" + "-" * len(s)
        for n, entry in enumerate(self.entries):
            s += "\n%s. %s" % (entry.rank, str(entry))
        return s

    def __getitem__(self, key):
        """Returns the (key + 1)-th chart entry; i.e., chart[0] refers to the
        top entry on the chart.
        """
        return self.entries[key]

    def __len__(self):
        """Returns the number of entries in the chart.
        A length of zero may indicate a failed/bad request.
        """
        return len(self.entries)

    def json(self):
        """Returns the entry as a JSON string.
        This is useful for caching.
        """
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)

    def _parseOldStylePage(self, soup):
        dateElement = soup.select_one(_DATE_ELEMENT_SELECTOR)
        if dateElement:
            dateText = dateElement.text.strip()
            curDate = datetime.datetime.strptime(dateText, "%B %d, %Y")
            self.date = curDate.strftime("%Y-%m-%d")

        prevWeek = soup.select_one(_PREVIOUS_DATE_SELECTOR)
        nextWeek = soup.select_one(_NEXT_DATE_SELECTOR)
        if prevWeek and prevWeek.parent.get("href"):
            self.previousDate = prevWeek.parent.get("href").split("/")[-1]
        if nextWeek and nextWeek.parent.get("href"):
            self.nextDate = nextWeek.parent.get("href").split("/")[-1]

        for entrySoup in soup.select(_ENTRY_LIST_SELECTOR):
            try:
                title = entrySoup[_ENTRY_TITLE_ATTR].strip()
            except:
                message = "Failed to parse title"
                raise BillboardParseException(message)

            try:
                artist = entrySoup[_ENTRY_ARTIST_ATTR].strip() or ""
            except:
                message = "Failed to parse artist"
                raise BillboardParseException(message)

            if artist == "":
                title, artist = artist, title

            try:
                imageSoup = entrySoup.select_one(_ENTRY_IMAGE_SELECTOR)
                if imageSoup.has_attr("data-src"):
                    image = imageSoup["data-src"]
                else:
                    image = imageSoup["src"]
            except:
                message = "Failed to parse image"
                raise BillboardParseException(message)

            try:
                rank = int(entrySoup[_ENTRY_RANK_ATTR].strip())
            except:
                message = "Failed to parse rank"
                raise BillboardParseException(message)

            if self.date:
                # "Ministats" is the name in the Billboard.com source code for
                # the stats under each chart entry
                def getMinistatsCellValue(fieldName, ifNoValue=None):
                    try:
                        for ministat in entrySoup.select(_MINISTATS_CELL):
                            heading = ministat.select_one(_MINISTATS_CELL_HEADING)
                            headingText = heading.string.strip().lower()
                            if headingText == fieldName:
                                value = ministat.text.split(u"\xa0")[0].strip()
                                if value is None or value == "-":
                                    return ifNoValue
                                else:
                                    return int(value)
                        return ifNoValue
                    except Exception as e:
                        print(e)
                        message = "Failed to parse ministats cell value: %s" % fieldName
                        raise BillboardParseException(message)

                peakPos = getMinistatsCellValue("peak")
                lastPos = getMinistatsCellValue("last", ifNoValue=0)
                weeks = getMinistatsCellValue("weeks", ifNoValue=1)
                isNew = True if weeks == 1 else False
            else:
                peakPos = lastPos = weeks = None
                isNew = False

            entry = ChartEntry(
                title, artist, image, peakPos, lastPos, weeks, rank, isNew
            )
            self.entries.append(entry)

    def _parseNewStylePage(self, soup):
        dateElement = soup.select_one("button.date-selector__button.button--link")
        if dateElement:
            dateText = dateElement.text.strip()
            curDate = datetime.datetime.strptime(dateText, "%B %d, %Y")
            self.date = curDate.strftime("%Y-%m-%d")

        self.previousDate = soup.select_one("#charts")["data-previous-chart-date"]
        self.nextDate = soup.select_one("#charts")["data-chart-next-date"]

        for entrySoup in soup.select("li.chart-list__element"):

            def getEntryAttr(selector):
                return entrySoup.select_one(selector).text.strip()

            try:
                title = getEntryAttr("span.chart-element__information__song")
            except:
                message = "Failed to parse title"
                raise BillboardParseException(message)

            try:
                artist = getEntryAttr("span.chart-element__information__artist") or ""
            except:
                message = "Failed to parse artist"
                raise BillboardParseException(message)

            if artist == "":
                title, artist = artist, title

            # TODO: Parse the image
            image = None

            try:
                rank = int(getEntryAttr("span.chart-element__rank__number"))
            except:
                message = "Failed to parse rank"
                raise BillboardParseException(message)

            def getMeta(attribute, ifNoValue=None):
                try:
                    selected = entrySoup.select_one(
                        "span.chart-element__meta.text--%s" % attribute
                    )
                    if (
                        not selected
                        or selected.string is None
                        or selected.string == "-"
                    ):
                        return ifNoValue
                    else:
                        return int(selected.string.strip())
                except:
                    message = "Failed to parse metadata value: %s" % attribute
                    raise BillboardParseException(message)

            if self.date:
                peakPos = getMeta("peak")
                lastPos = getMeta("last", ifNoValue=0)
                weeks = getMeta("week", ifNoValue=1)
                isNew = True if weeks == 1 else False
            else:
                peakPos = lastPos = weeks = None
                isNew = False

            entry = ChartEntry(
                title, artist, image, peakPos, lastPos, weeks, rank, isNew
            )
            self.entries.append(entry)

    def _parsePage(self, soup):
        chartTitleElement = soup.select_one(_CHART_NAME_SELECTOR)
        if chartTitleElement:
            self.title = re.sub(
                " Chart$",
                "",
                chartTitleElement.get("content", "").split("|")[0].strip(),
            )
        if soup.select("table"):
            self._parseOldStylePage(soup)
        else:
            self._parseNewStylePage(soup)

    def fetchEntries(self):
        """GETs the corresponding chart data from Billboard.com, then parses
        the data using BeautifulSoup.
        """
        if not self.date:
            # Fetch latest chart
            url = "https://www.billboard.com/charts/%s" % (self.name)
        else:
            url = "https://www.billboard.com/charts/%s/%s" % (self.name, self.date)
        session = _get_session_with_retries(max_retries=self._max_retries)
        req = session.get(url, timeout=self._timeout)
        if req.status_code == 404:
            message = "Chart not found (perhaps the name is misspelled?)"
            raise BillboardNotFoundException(message)
        req.raise_for_status()
        soup = BeautifulSoup(req.text, "html.parser")
        self._parsePage(soup)


def charts():
    """Gets a list of all Billboard charts from Billboard.com."""
    session = _get_session_with_retries(max_retries=5)
    req = session.get("https://www.billboard.com/charts", timeout=25)
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")
    return [
        link["href"].split("/")[-1]
        for link in soup.findAll("a", {"class": "chart-panel__link"})
    ]


def _get_session_with_retries(max_retries):
    session = requests.Session()
    session.mount(
        "https://www.billboard.com",
        requests.adapters.HTTPAdapter(max_retries=max_retries),
    )
    return session
guoguo12 told me that I need to do this:
"Oh, gotcha. You need to write a selector for the a element, grab the URL, download that page, and parse it."
However, I am not sure what the above means (did I mention that I am a noob?).
My question is: where do I add the selector tag? I think I can add something like
_chart-element__information__lyrics show-more__hidden-el" title="
under
# css selector constants
But this doesn't seem right.
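Roughly, what guoguo12's comment means is: add one more selector constant alongside the existing ones (just the CSS selector, not the title attribute), use it inside the entry-parsing loop to grab the link's href, and then fetch and parse that URL as a separate request. A minimal sketch against the new-style markup (the _ENTRY_LYRICS_SELECTOR name and getLyricsUrl helper are my own, and Billboard's markup may have changed since):

import requests
from bs4 import BeautifulSoup

# css selector constant, placed next to the others in billboard.py
_ENTRY_LYRICS_SELECTOR = "a.chart-element__information__lyrics"

def getLyricsUrl(entrySoup):
    # returns the href of the entry's lyrics link, or None if it has none
    link = entrySoup.select_one(_ENTRY_LYRICS_SELECTOR)
    return link["href"] if link else None

Inside _parseNewStylePage's loop you would call getLyricsUrl(entrySoup), store the result on the ChartEntry, and download/parse the linked page only when the lyrics text itself is needed.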
I am not sure that this answers the above question specifically; however, it does solve the problem I wanted to solve. As mentioned by Shinyhero36, you can use the following code to return lyrics for a Billboard track:
import requests
from bs4 import BeautifulSoup as Parse

def make_soup(url):
    """
    Parse a web page into html
    """
    user_agent = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    }
    r = requests.get(url, headers=user_agent)
    html = Parse(r.content, "html.parser")
    return html

def format_url(string):
    """
    Replace spaces with '%20'
    """
    return string.replace(" ", "%20")

def get_song_url(html):
    song_url = html.find("a", {"class": "title"})["href"]
    return song_url

def find_Lyrics(titre, artiste):
    url = f"https://www.musixmatch.com/fr/search/{artiste}%20{titre}/tracks"
    url = format_url(url)
    pageweb = make_soup(url)

    # Get the link to the song
    song_url = pageweb.find("a", {"class": "title"})["href"]
    song_url = "https://www.musixmatch.com" + song_url

    # Get the lyrics
    pageweb = make_soup(song_url)
    paroles = list()
    for span in pageweb.find_all("span", {"class" : "lyrics__content__ok"}):
        print(span.text)

find_Lyrics("title","artist")
Replace title and artist with the title and artist of the song you are searching for.
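For example, using the song from the HTML snippet in the question (note that the function takes the title first, then the artist):

find_Lyrics("Blinding Lights", "The Weeknd")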

Crawling Craigslist with Python (not Scrapy)

I am trying to crawl Craigslist jobs using Python (I am not using Scrapy). Can anyone please fix the code below? Please don't suggest Scrapy.
This is the URL: https://chicago.craigslist.org/
First I extract the job categories, then the job listings, then the job details; I have also written code to crawl the next page.
import re
import requests
import csv
from html import unescape

def get_page_content(url):
    response = requests.get(url)
    return response.text

def get_category_list(content):
    return category_pat.findall(content)[90:121]

def get_next_page(content):
    result = next_page_pat.findall(content)
    if len(result) == 0:
        return None
    else:
        result = 'https://chicago.craigslist.org/' + result[0]
        return result

def get_job_list(content):
    result = job_list_pat.findall(content)
    return result

def get_job_details(content):
    result = desc_pat.findall(content)
    if len(result) == 0:
        description = ''
    else:
        description = str(result[0])
    return description

def scrape_job_info(job_info, category_name):
    job_url, job_name = job_info
    job_name = unescape(job_name)
    job_dict = {'jobname': job_name, 'category': category_name}
    job_dict['JOBURL'] = job_url
    print('scraping', job_name)
    content = get_page_content(job_url)  # was get_category_list(job_url), which looks like a bug: the job page HTML is needed here
    description = get_job_details(content)
    job_dict['Description'] = description
    print(job_dict)

def crawl_category(category_name, category_url):
    while True:
        print(category_url)
        content = get_page_content(category_url)
        job_list = get_job_list(content)
        print(job_list)
        for job_info in job_list:
            scrape_job_info(job_info, category_name)
        next_page = get_next_page(content)
        if next_page is None:
            break
        category_url = next_page

def crawl_website():
    url = 'https://chicago.craigslist.org'
    content = get_page_content(url)
    category_list = get_category_list(content)
    for category in category_list:
        category_url, category_name = category
        category_url = url + category_url
        crawl_category(category_name, category_url)

if __name__ == '__main__':
    url = 'https://chicago.craigslist.org'
    response = requests.get(url)
    content = response.text
    category_pat = re.compile(r'<li><a href=\"(\/d\/[\w\-]+\/\w+\/\w+)\".+txt\">([\w\-\+\s+\/\<]+)<sup class')
    next_page_pat = re.compile(
        r'<a href=\"\/(.*)\" class=\"button next\" title=\"next\s+page\">next > <\/a>\s+<span class=\"button next\" title=\"next page\">\s+next >\s+<\/span>\s+<\/span>\s+<\/div>\s+<\/div>\s+.+\s+.+')
    job_list_pat = re.compile(r'([\w\s*]+)')
    desc_pat = re.compile(r'<\/div>\s*<section id=\"postingbody\">.+html\"><\/div>\s*<\/div>(.+)<\/section><ul')
    img_pat = re.compile(r'<img src=\"(.*jpg)\" title')
    crawl_website()
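Incidentally, job_list_pat as shown matches almost any text (it looks like part of the original pattern was lost when the HTML was escaped), so it helps to test each regex against a single page before running the full crawl. A quick check, assuming the definitions above:

content = get_page_content('https://chicago.craigslist.org')
print(get_category_list(content))  # expect (url, name) tuples; an empty list means the pattern no longer matches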
