Trouble returning web scraping output as dictionary - python-3.x

So I am attempting to scrape a website's staff roster, and I want the end product to be a dictionary in the format {staff: position}. Currently it returns every staff name and position as a separate string. It is hard to post the output clearly, but it essentially goes down the list of names, then the positions, so the first name in the list should be paired with the first position, and so on. I have determined that each name and position is a bs4.element.Tag. I believe I need to turn the names and the positions into two lists, then use zip() to pair the elements up in a dictionary. I have tried implementing this, but nothing so far has worked. The lowest I could get to the text I need using the class_ parameter was the individual div that the p is contained in. I am still inexperienced with Python and new to web scraping, but I am relatively well versed in HTML and CSS, so help would be greatly appreciated.
# Simple script attempting to scrape
# the staff roster off of the
# Greenville Drive website
import requests
from bs4 import BeautifulSoup
URL = 'https://www.milb.com/greenville/ballpark/frontoffice'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
staff = soup.find_all('div', class_='l-grid__col l-grid__col--xs-12 l-grid__col--sm-4 l-grid__col--md-3 l-grid__col--lg-3 l-grid__col--xl-3')
for staff_div in staff:
    data = staff_div.find('p')
    if data:
        print(data.text.strip())

position = soup.find_all('div', class_='l-grid__col l-grid__col--xs-12 l-grid__col--sm-4 l-grid__col--md-6 l-grid__col--lg-6 l-grid__col--xl-6')
for position_div in position:
    data = position_div.find('p')
    if data:
        print(data.text.strip())
# This code so far provides the needed data, but need it in a dict()
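For reference, the pattern the question describes (two parallel lists combined into a dict with zip()) is a one-liner once both lists exist. A minimal sketch, where names and positions stand in for the two lists of stripped strings printed above:
names = ['Craig Brown', 'Eric Jarinko']
positions = ['Owner/Team President', 'General Manager']
roster = dict(zip(names, positions))  # zip pairs elements by index
print(roster)  # {'Craig Brown': 'Owner/Team President', 'Eric Jarinko': 'General Manager'}
This relies on the two lists being the same length and in the same order, which is why both answers below pair each name with its position structurally instead.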

BeautifulSoup has find_next(), which can be used to get the next tag matching the specified filters. Find the "staff" div, then use find_next() to get the adjacent "position" div.
import requests
from bs4 import BeautifulSoup
URL = 'https://www.milb.com/greenville/ballpark/frontoffice'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
staff_class = 'l-grid__col l-grid__col--xs-12 l-grid__col--sm-4 l-grid__col--md-3 l-grid__col--lg-3 l-grid__col--xl-3'
position_class = 'l-grid__col l-grid__col--xs-12 l-grid__col--sm-4 l-grid__col--md-6 l-grid__col--lg-6 l-grid__col--xl-6'
result = {}
for staff in soup.find_all('div', class_=staff_class):
    data = staff.find('p')
    if data:
        staff_name = data.text.strip()
        position_div = staff.find_next('div', class_=position_class)
        position_name = position_div.text.strip()
        result[staff_name] = position_name
print(result)
Output
{'Craig Brown': 'Owner/Team President', 'Eric Jarinko': 'General Manager', 'Nate Lipscomb': 'Special Advisor to the President', 'Phil Bargardi': 'Vice President of Sales', 'Jeff Brown': 'Vice President of Marketing', 'Greg Burgess, CSFM': 'Vice President of Operations/Grounds', 'Jordan Smith': 'Vice President of Finance', 'Ned Kennedy': 'Director of Inside Sales', 'Patrick Innes': 'Director of Ticket Operations', 'Micah Gold': 'Senior Account Executive', 'Molly Mains': 'Senior Account Executive', 'Houghton Flanagan': 'Account Executive', 'Jeb Maloney': 'Account Executive', 'Olivia Adams': 'Inside Sales Representative', 'Tyler Melson': 'Inside Sales Representative', 'Toby Sandblom': 'Inside Sales Representative', 'Katie Batista': 'Director of Sponsorships and Community Engagement', 'Matthew Tezza': 'Sponsor Services and Activations Manager', 'Melissa Welch': 'Sponsorship and Community Events Manager', 'Beth Rusch': 'Director of West End Events', 'Kristin Kipper': 'Events Manager', 'Grant Witham': 'Events Manager', 'Alex Guest': 'Director of Game Entertainment & Production', 'Lance Fowler': 'Director of Video Production', 'Davis Simpson': 'Director of Media and Creative Services', 'Cameron White': 'Media Relations Manager', 'Ed Jenson': 'Broadcaster', 'Adam Baird': 'Accountant', 'Mike Agostino': 'Director of Food and Beverage', 'Roger Campana': 'Assistant Director of Food and Beverage', 'Wilbert Sauceda': 'Executive Chef', 'Elise Parish': 'Premium Services Manager', 'Timmy Hinds': 'Director of Facility Operations', 'Zack Pagans': 'Assistant Groundskeeper', 'Amanda Medlin': 'Business and Team Operations Manager', 'Allison Roedell': 'Office Manager'}

Solution using CSS selectors and zip():
import requests
from bs4 import BeautifulSoup
url = 'https://www.milb.com/greenville/ballpark/frontoffice'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
out = {}
for name, position in zip(soup.select('div:has(+ div p) b'),
                          soup.select('div:has(> div b) + div p')):
    out[name.text] = position.text
from pprint import pprint
pprint(out)
Prints:
{'Adam Baird': 'Accountant',
'Alex Guest': 'Director of Game Entertainment & Production',
'Allison Roedell': 'Office Manager',
'Amanda Medlin': 'Business and Team Operations Manager',
'Beth Rusch': 'Director of West End Events',
'Brady Andrews': 'Assistant Director of Facility Operations',
'Brooks Henderson': 'Merchandise Manager',
'Bryan Jones': 'Facilities Cleanliness Manager',
'Cameron White': 'Media Relations Manager',
'Craig Brown': 'Owner/Team President',
'Davis Simpson': 'Director of Media and Creative Services',
'Ed Jenson': 'Broadcaster',
'Elise Parish': 'Premium Services Manager',
'Eric Jarinko': 'General Manager',
'Grant Witham': 'Events Manager',
'Greg Burgess, CSFM': 'Vice President of Operations/Grounds',
'Houghton Flanagan': 'Account Executive',
'Jeb Maloney': 'Account Executive',
'Jeff Brown': 'Vice President of Marketing',
'Jenny Burgdorfer': 'Director of Merchandise',
'Jordan Smith ': 'Vice President of Finance',
'Katie Batista': 'Director of Sponsorships and Community Engagement',
'Kristin Kipper': 'Events Manager',
'Lance Fowler': 'Director of Video Production',
'Matthew Tezza': 'Sponsor Services and Activations Manager',
'Melissa Welch': 'Sponsorship and Community Events Manager',
'Micah Gold': 'Senior Account Executive',
'Mike Agostino': 'Director of Food and Beverage',
'Molly Mains': 'Senior Account Executive',
'Nate Lipscomb': 'Special Advisor to the President',
'Ned Kennedy': 'Director of Inside Sales',
'Olivia Adams': 'Inside Sales Representative',
'Patrick Innes': 'Director of Ticket Operations',
'Phil Bargardi': 'Vice President of Sales',
'Roger Campana': 'Assistant Director of Food and Beverage',
'Steve Seman': 'Merchandise / Ticketing Advisor',
'Timmy Hinds': 'Director of Facility Operations',
'Toby Sandblom': 'Inside Sales Representative',
'Tyler Melson': 'Inside Sales Representative',
'Wilbert Sauceda': 'Executive Chef',
'Zack Pagans': 'Assistant Groundskeeper'}
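For reference, the :has() relational pseudo-class used above is handled by soupsieve, the selector engine bundled with BeautifulSoup 4. The same pairing can also be written as a dict comprehension, equivalent to the loop above:
out = {name.text: position.text
       for name, position in zip(soup.select('div:has(+ div p) b'),
                                 soup.select('div:has(> div b) + div p'))}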

Related

Creating a function for my python web scraper that will output a dictionary

I have created my web scraper and added a function, but unfortunately my function is not being called and the output is not coming out as a dictionary. How do I create and call the function and store the output as a dictionary? Below is my code and function so far.
from bs4 import BeautifulSoup
import requests

top_stories = []

def get_stories():
    """ user agent to facilitates end-user interaction with web content"""
    headers = {
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
    }
    base_url = 'www.example.com'
    source = requests.get(base_url).text
    soup = BeautifulSoup(source, 'html.parser')
    articles = soup.find_all("article", class_="card")
    print(f"Number of articles found: {len(articles)}")
    for article in articles:
        try:
            headline = article.h3.text.strip()
            link = base_url + article.a['href']
            text = article.find("div", class_="field--type-text-with-summary").text.strip()
            img_url = base_url + article.picture.img['data-src']
            print(headline, link, text, img_url)
            stories_dict = {}
            stories_dict['Headline'] = headline
            stories_dict['Link'] = link
            stories_dict['Text'] = text
            stories_dict['Image'] = img_url
            top_stories.append(stories_dict)
        except AttributeError as ex:
            print('Error:', ex)

get_stories()
To get the data in dictionary format (dict), you can create a dictionary as follows:
top_stories = {"Headline": [], "Link": [], "Text": [], "Image": []}
and append the correct data to it.
(By the way, as you have specified your headers, it is a set, not a dict; it needs a key such as "user-agent".)
from bs4 import BeautifulSoup
import requests

def get_stories():
    """user agent to facilitates end-user interaction with web content"""
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36"
    }
    top_stories = {"Headline": [], "Link": [], "Text": [], "Image": []}
    base_url = "https://www.jse.co.za/"
    source = requests.get(base_url, headers=headers).text
    soup = BeautifulSoup(source, "html.parser")
    articles = soup.find_all("article", class_="card")
    print(f"Number of articles found: {len(articles)}")
    for article in articles:
        try:
            top_stories["Headline"].append(article.h3.text.strip())
            top_stories["Link"].append(base_url + article.a["href"])
            top_stories["Text"].append(
                article.find("div", class_="field--type-text-with-summary").text.strip()
            )
            top_stories["Image"].append(base_url + article.picture.img["data-src"])
        except AttributeError as ex:
            print("Error:", ex)
    print(type(top_stories))
    print(top_stories)

get_stories()
Output:
Number of articles found: 6
<class 'dict'>
{'Headline': ['South Africa offers investment opportunities to Asia Pacific investors', 'South Africa to showcase investment opportunities to the UAE market', 'South Africa to showcase investment opportunities to UK investors', 'JSE to become 100% owner of JSE Investor Services and expands services to include share plan administration services', 'Thungela Resources lists on the JSE after unbundling from Anglo American', 'JSE welcomes SAB’s B-BBEE scheme that gives investors exposure to AB InBev global market'], 'Link': ['https://www.jse.co.za//news/market-news/south-africa-offers-investment-opportunities-asia-pacific-investors', 'https://www.jse.co.za//news/market-news/south-africa-showcase-investment-opportunities-uae-market', 'https://www.jse.co.za//news/market-news/south-africa-showcase-investment-opportunities-uk-investors', 'https://www.jse.co.za//news/market-news/jse-become-100-owner-jse-investor-services-and-expands-services-include-share-plan', 'https://www.jse.co.za//news/market-news/thungela-resources-lists-jse-after-unbundling-anglo-american', 'https://www.jse.co.za//news/market-news/jse-welcomes-sabs-b-bbee-scheme-gives-investors-exposure-ab-inbev-global-market'], 'Text': ['The Johannesburg Stock Exchange (JSE) and joint sponsors, Citi and Absa Bank are collaborating to host the annual SA Tomorrow Investor conference, which aims to showcase the country’s array of investment opportunities to investors in the Asia Pacific region, mainly from Hong Kong and Singapore.', 'The Johannesburg Stock Exchange (JSE) and joint sponsors, Citi and Absa Bank are collaborating to host the SA Tomorrow Investor conference, which aims to position South Africa as a preferred investment destination for the United Arab Emirates (UAE) market.', 'The Johannesburg Stock Exchange (JSE) and joint sponsors Citi and Absa Bank are collaborating to host the annual SA Tomorrow Investor conference, which aims to showcase the country’s array of investment opportunities to investors in the United Kingdom.', 'The Johannesburg Stock Exchange (JSE) is pleased to announce that it has embarked on a process to incorporate JSE Investor Services Proprietary Limited (JIS) as a wholly owned subsidiary of the JSE by acquiring the minority shareholding of 25.15 % from LMS Partner Holdings.', 'Shares in Thungela Resources, a South African thermal coal exporter, today commenced trading on the commodity counter of the Main Board of the Johannesburg Stock Exchange (JSE).', 'From today, Black South African retail investors will get the opportunity to invest in the world’s largest beer producer, AB InBev, following the listing of SAB Zenzele Kabili on the Johannesburg Stock Exchange’s (JSE) Empowerment Segment.'], 'Image': ['https://www.jse.co.za//sites/default/files/styles/standard_lg/public/medial/images/2021-06/Web_Banner_0.jpg?h=4ae650de&itok=hdGEy5jA', 'https://www.jse.co.za//sites/default/files/styles/standard_lg/public/medial/images/2021-06/Web_Banner2.jpg?h=4ae650de&itok=DgPFtAx8', 'https://www.jse.co.za//sites/default/files/styles/standard_lg/public/medial/images/2021-06/Web_Banner.jpg?h=4ae650de&itok=Q0SsPtAz', 'https://www.jse.co.za//sites/default/files/styles/standard_lg/public/medial/images/2020-12/DSC_0832.jpg?h=156fdada&itok=rL3M2gpn', 'https://www.jse.co.za//sites/default/files/styles/standard_lg/public/medial/images/2021-06/Thungela_Web_Banner_1440x390.jpg?h=4ae650de&itok=kKRO5fQk', 
'https://www.jse.co.za//sites/default/files/styles/standard_lg/public/medial/images/2021-05/SAB-Zenzele.jpg?h=4ae650de&itok=n9osAP33']}
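If a list with one dict per story is preferred (closer to the original attempt), the dict of lists can be transposed afterwards. A minimal sketch, assuming the top_stories structure built above:
stories = [dict(zip(top_stories.keys(), values))
           for values in zip(*top_stories.values())]
# [{'Headline': '...', 'Link': '...', 'Text': '...', 'Image': '...'}, ...]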

How can I get the asin of this string using regex?

Hi, I have the following string. How can I get the B000RMTGUQ out of this entire string using regex in Python?
{'asin': 'B000RMTGUQ', 'imUrl': 'http://ecx.images-amazon.com/images/I/515KlX4dEUL._BO2,204,203,200_PIsitb-sticker-v3-big,TopRight,0,-55_SX278_SY278_PIkin4,BottomRight,1,22_AA300_SH20_OU01_.jpg', 'related': {'also_bought': ['B009QJMXI8', 'B00CNQ7MJG']}, 'categories': [['Books', 'History', 'World', 'Jewish', 'Holocaust'], ['Books', 'History', 'World', 'Religious', 'Judaism'], ['Books', 'Politics & Social Sciences', 'Social Sciences'], ['Books', 'Religion & Spirituality', 'Judaism'], ['Kindle Store', 'Kindle eBooks', 'History', 'World', 'Jewish', 'Holocaust'], ['Kindle Store', 'Kindle eBooks', 'Politics & Social Sciences', 'Social Sciences'], ['Kindle Store', 'Kindle eBooks', 'Religion & Spirituality', 'Judaism']], 'description': "Kibbutz Buchenwald was founded in Germany in 1945 by 16 survivors of Buchenwald concentration camp. The Zionist training farm was organized to prepare Jews for emigration to Palestine. One of the founders was Yeohezkel Tydor, the author's father, who died in 1993. Baumel's narration of the kibbutz's history is divided into two sections. Part one examines the kibbutz from its creation until the departure of the founding group to Palestine in late summer 1945. Part two traces the kibbutz's subsequent history in Palestine and Germany, from the autumn of 1945 until the mid-1950s. Kibbutz Buchenwald was abolished in Germany in 1948; the kibbutz as it was founded in what is now Israel--named Netzer Sereni--still exists today. The story of these pioneers and their physical, psychological, ideological, and political struggles forms the nucleus of this absorbing book.George Cohen"}
You could use json to parse your string into a valid dictionary:
First note that valid JSON uses double quotes, so every single quote is swapped for a double quote. That swap also mangles apostrophes (author's becomes author"s), so those need switching back to single quotes. Hence you could do:
import json, re
dct = json.loads(re.sub('"s', "'s", re.sub("'", '"', string)))
dct['asin']
'B000RMTGUQ'
EDIT
From the comments below, it seems you do not have a JSON string but rather a valid Python dictionary in string format. Therefore you could directly do:
dc = eval(string)
dc['asin']
Furthermore, consider using ast.literal_eval rather than eval, since it only evaluates Python literals and cannot execute arbitrary code (see the sketch after the data block below).
data
string = """{'asin': 'B000RMTGUQ', 'imUrl': 'http://ecx.images-amazon.com/images/I/515KlX4dEUL._BO2,204,203,200_PIsitb-sticker-v3-big,TopRight,0,-55_SX278_SY278_PIkin4,BottomRight,1,22_AA300_SH20_OU01_.jpg', 'related': {'also_bought': ['B009QJMXI8', 'B00CNQ7MJG']}, 'categories': [['Books', 'History', 'World', 'Jewish', 'Holocaust'], ['Books', 'History', 'World', 'Religious', 'Judaism'], ['Books', 'Politics & Social Sciences', 'Social Sciences'], ['Books', 'Religion & Spirituality', 'Judaism'], ['Kindle Store', 'Kindle eBooks', 'History', 'World', 'Jewish', 'Holocaust'], ['Kindle Store', 'Kindle eBooks', 'Politics & Social Sciences', 'Social Sciences'], ['Kindle Store', 'Kindle eBooks', 'Religion & Spirituality', 'Judaism']], 'description': "Kibbutz Buchenwald was founded in Germany in 1945 by 16 survivors of Buchenwald concentration camp. The Zionist training farm was organized to prepare Jews for emigration to Palestine. One of the founders was Yeohezkel Tydor, the author's father, who died in 1993. Baumel's narration of the kibbutz's history is divided into two sections. Part one examines the kibbutz from its creation until the departure of the founding group to Palestine in late summer 1945. Part two traces the kibbutz's subsequent history in Palestine and Germany, from the autumn of 1945 until the mid-1950s. Kibbutz Buchenwald was abolished in Germany in 1948; the kibbutz as it was founded in what is now Israel--named Netzer Sereni--still exists today. The story of these pioneers and their physical, psychological, ideological, and political struggles forms the nucleus of this absorbing book.George Cohen"}"""

Beautiful Soup Scraping

I'm having issues with old working code not functioning correctly anymore.
My python code is scraping a website using beautiful soup and extracting event data (date, event, link).
My code pulls all of the events, which are located in the tbody. Each event is stored in a <tr class="Box">. The issue is that my scraper seems to stop after this <tr style="box-shadow: none;">. After it reaches this section (which contains 3 advertisements on the site for events that I don't want to scrape), the code stops pulling event data from within the <tr class="Box"> rows. Is there a way to skip this tr style / ignore future cases?
import pandas as pd
import bs4 as bs
from bs4 import BeautifulSoup
import urllib.request
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

source = urllib.request.urlopen('https://10times.com/losangeles-us/technology/conferences').read()
soup = bs.BeautifulSoup(source, 'html.parser')

# ---Get Event Data---
test1 = []
table = soup.find('tbody')
table_rows = table.find_all('tr')  # find table rows (tr)
for tr in table_rows:
    data = tr.find_all('td')  # find table data
    row = [td.text for td in data]
    if len(row) > 2:  # Excludes rows with only event name/link, but no data.
        test1.append(row)
test1
The data is loaded dynamically via JavaScript, so you don't see more results. You can use this example to load more pages:
import requests
from bs4 import BeautifulSoup

url = "https://10times.com/ajax?for=scroll&path=/losangeles-us/technology/conferences"
params = {"page": 1, "ajax": 1}
headers = {"X-Requested-With": "XMLHttpRequest"}

for params["page"] in range(1, 4):  # <-- increase number of pages here
    print("Page {}..".format(params["page"]))
    soup = BeautifulSoup(
        requests.get(url, headers=headers, params=params).content,
        "html.parser",
    )
    for tr in soup.select('tr[class="box"]'):
        tds = [td.get_text(strip=True, separator=" ") for td in tr.select("td")]
        print(tds)
Prints:
Page 1..
['Tue, 29 Sep - Thu, 01 Oct 2020', 'Lens Los Angeles', 'Intercontinental Los Angeles Downtown, Los Angeles', 'LENS brings together the entire Degreed community - our clients, invited prospective clients, thought leaders, partners, employees, executives, and industry experts for two days of discussion, workshops,...', 'Business Services IT & Technology', 'Interested']
['Wed, 30 Sep - Sat, 03 Oct 2020', 'FinCon', 'Long Beach Convention & Entertainment Center, Long Beach 20.1 Miles from Los Angeles', 'FinCon will be helping financial influencers and brands create better content, reach their audience, and make more money. Collaborate with other influencers who share your passion for making personal finance...', 'Banking & Finance IT & Technology', 'Interested 7 following']
['Mon, 05 - Wed, 07 Oct 2020', 'NetDiligence Cyber Risk Summit', 'Loews Santa Monica Beach Hotel, Santa Monica 14.6 Miles from Los Angeles', 'NetDiligence Cyber Risk Summit will conference are attended by hundreds of cyber risk insurance, legal/regulatory and security/privacy technology leaders from all over the world. Connect with leaders in...', 'IT & Technology', 'Interested']
... etc.
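Since the question already imports pandas, the rows can be collected into a DataFrame instead of printed. A sketch building on the AJAX loop above; the column names are assumptions to be confirmed against the actual td layout:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://10times.com/ajax?for=scroll&path=/losangeles-us/technology/conferences"
headers = {"X-Requested-With": "XMLHttpRequest"}

rows = []
for page in range(1, 4):
    soup = BeautifulSoup(
        requests.get(url, headers=headers, params={"page": page, "ajax": 1}).content,
        "html.parser",
    )
    for tr in soup.select('tr[class="box"]'):
        rows.append([td.get_text(strip=True, separator=" ") for td in tr.select("td")])

df = pd.DataFrame(rows)  # e.g. df.columns = ['date', 'event', 'venue', ...] once verified
print(df.head())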

I need help scraping the following information from the webpage

I need to obtain details of federal agencies from a web page that lists them alphabetically, a through w. I need to get the agency name, website, and contact.
The code that I have only returns the agency names from one page.
import requests
import bs4

res = requests.get("https://www.usa.gov/federal-agencies/")
soup = bs4.BeautifulSoup(res.text, 'lxml')
for i in soup.select('.url'):
    print(i.text)
I expect to get the agency name, website, and contact address for all the pages, from a to w.
You are going to have to iterate through each page, and follow the link of each item to then pull the data you want:
Code:
import requests
import bs4

for letter in map(chr, range(97, 123)):  # iterate the letter pages a-z
    res = requests.get("https://www.usa.gov/federal-agencies/%s" % letter)
    soup = bs4.BeautifulSoup(res.text, 'lxml')
    section = soup.find('ul', {'class': 'one_column_bullet'})
    links = ['https://www.usa.gov' + i['href'] for i in section.find_all('a', {'class': 'url'})]
    for link in links:
        res2 = requests.get(link)
        soup = bs4.BeautifulSoup(res2.text, 'lxml')
        agency_name = soup.find('h1').text
        website = soup.find('h3', {'class': 'org'}).findNext('a')['href']
        try:
            address = soup.find('p', {'class': 'spk street-address'}).text.strip()
            address = address.split('\n')
            address = ' '.join([i.strip() for i in address if i.strip() != ''])
        except:
            address = 'N/A'
        print('Name:\t\t%s\nWebsite:\t%s\nAddress:\t%s\n' % (agency_name, website, address))
Output:
Name: U.S. AbilityOne Commission
Website: http://www.abilityone.gov
Address: 1401 S. Clark Street Suite 715 Arlington, VA 22202-3259
Name: U.S. Access Board
Website: http://www.access-board.gov/
Address: 1331 F St., NW Suite 1000 Washington, DC 20004-1111
Name: Administration for Children and Families
Website: http://www.acf.hhs.gov/
Address: 330 C St., SW Washington, DC 20201
Name: Administration for Community Living
Website: http://www.acl.gov
Address: One Massachusetts Ave., NW Washington, DC 20201
Name: Administration for Native Americans
Website: http://www.acf.hhs.gov/programs/ana/
Address: 2nd Floor, West Aerospace Center 370 L'Enfant Promenade, SW Washington, DC 20447-0002
Name: Administrative Conference of the United States
Website: http://acus.gov/
Address: 1120 20th St., NW Suite 706 South Washington, DC 20036
Name: Administrative Office of the U.S. Courts
Website: http://www.uscourts.gov/
Address: One Columbus Circle, NE Washington, DC 20544
...
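To keep the results instead of printing them, the three fields can be appended to a list of dicts. A sketch of the change; record() is a hypothetical helper to call in place of print() in the inner loop above:
agencies = []

def record(agency_name, website, address):
    # hypothetical helper: call instead of print() inside the inner loop
    agencies.append({'Name': agency_name, 'Website': website, 'Address': address})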

Unexpected behavior of append() method. Why, when adding a dictionary to the list, are the previous elements of the list overwritten?

The following code checks for the presence of a company name (from tickersList), or a fragment of it, in the text of each news item (from newsList).
When a company is found in a news item, print gives the expected ticker for it, but after adding that news item to the list, something nonsensical happens :(
It looks like, when appending a dictionary (news) to the list (tickersNews), the previous elements of the list get overwritten. Why?
It should be noted that when the news is appended as a dictionary converted to a string, everything works as it should.
import re
tickersList = [('ATI', 'Allegheny rporated', 'Allegheny Technologies Incorporated'), ('ATIS', 'Attis', 'Attis Industries, Inc.'), ('ATKR', 'Atkore International Group', 'Atkore International Group Inc.'), ('ATMP', 'Barclays + Select M', 'Barclays ETN+ Select MLP'), ('ATNM', 'Actinium', 'Actinium Pharmaceuticals, Inc.'), ('ATNX', 'Athenex', 'Athenex, Inc.'), ('ATOS', 'Atossa Genetics', 'Atossa Genetics Inc.'), ('ATRA', 'Atara Biotherapeutics', 'Atara Biotherapeutics, Inc.'), ('ATRC', 'AtriCure', 'AtriCure, Inc.'), ('ATRO', 'Astronics', 'Astronics Corporation'), ('ATRS', 'Antares Pharma', 'Antares Pharma, Inc.'), ('ATSG', 'Air Transport Services Group', 'Air Transport Services Group, Inc.'), ('CJ', 'C&J Energy', 'C&J Energy Services, Inc.'), ('CJJD', 'China Jo-Jo Drugstores', 'China Jo-Jo Drugstores, Inc.'), ('CLAR', 'Clarus', 'Clarus Corporation'), ('CLD', 'Cloud Peak Energy', 'Cloud Peak Energy Inc.'), ('CLDC', 'China Lending', 'China Lending Corporation'), ('CLDR', 'Cloudera', 'Cloudera, Inc.')]
newsList = [
    {'title': 'Atara Biotherapeutics Announces Planned Chief Executive Officer Transition'},
    {'title': 'Chongqing Jingdong Pharmaceutical and Athenex Announce a Strategic Partnership and Licensing Agreement to Develop and Commercialize KX2-391 in China'}
]
tickersNews = []
for news in newsList:
    # pass through the list of companies looking for their mention in the news
    for ticker, company, company_full in tickersList:
        # clear the full name of the company of brackets, double spaces, articles,
        # periods and commas, and save the fragments of the full name to a list
        companyFullFragments = company_full.replace(',', '')\
            .replace('.', '').replace('The ', ' ')\
            .replace('(', ' ').replace(')', ' ')\
            .replace('  ', ' ').strip().split()
        # look for the company in the news, each time cutting off
        # the last fragment from the full company name
        for i in range(len(companyFullFragments), 0, -1):
            companyFullFragmentsString = ' '.join(companyFullFragments[:i]).strip()
            lookFor_company = r'(^|\s){0}(\s|$)'.format(companyFullFragmentsString)
            results_company = re.findall(lookFor_company, news['title'])
            # if the title of the news contains the name of the company,
            # add the ticker to the news, print it and append the news to the list
            if results_company:
                news['ticker'] = ticker  # , companyFullFragmentsString, company_full
                print(news['ticker'], 'found')
                # tickersNews.append(str(news))
                # ----------------------------- Here is the problem!(?)
                tickersNews.append(news)
                # move on to the next company
                break

print(20 * '-', 'appended:')
for news in tickersNews:
    print(news['ticker'])
Output (list of dict):
ATRA found
ATNX found
CJJD found
CLDC found
-------------------- appended:
ATRA
CLDC
CLDC
CLDC
Output (list of strings):
ATRA found
ATNX found
CJJD found
CLDC found
-------------------- appended as a strings:
["{'title': 'Atara Biotherapeutics Announces Planned Chief Executive Officer Transition', 'ticker': 'ATRA'}", "{'title': 'Chongqing Jingdong Pharmaceutical and Athenex Announce a Strategic Partnership and Licensing Agreement to Develop and Commercialize KX2-391 in China', 'ticker': 'ATNX'}", "{'title': 'Chongqing Jingdong Pharmaceutical and Athenex Announce a Strategic Partnership and Licensing Agreement to Develop and Commercialize KX2-391 in China', 'ticker': 'CJJD'}", "{'title': 'Chongqing Jingdong Pharmaceutical and Athenex Announce a Strategic Partnership and Licensing Agreement to Develop and Commercialize KX2-391 in China', 'ticker': 'CLDC'}"]
The problem originates from two lines inside the for loop: news['ticker'] = ticker and tickersNews.append(news). Appending news does not copy it; the list ends up holding several references to the same dictionary object, and each later news['ticker'] = ticker mutates that one object, so the earlier entries appear to be overwritten. A much simpler version of your problem is:
a = 10
a = 20
a = 30
print(a, a, a)
Output will be 30 30 30. I guess it's obvious.
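The dictionary version of the same effect (a minimal illustration, not the asker's data):
d = {}
lst = []
for ticker in ('ATRA', 'ATNX', 'CJJD', 'CLDC'):
    d['ticker'] = ticker  # mutates the one shared dict
    lst.append(d)         # appends another reference to the same object
print([item['ticker'] for item in lst])  # ['CLDC', 'CLDC', 'CLDC', 'CLDC']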
To solve the problem you may use several approaches.
First possibility (easiest): replace tickersNews.append(news) with tickersNews.append(news.copy()), so each appended dictionary is an independent snapshot.
Second possibility (preferable): don't use tickersNews at all. For every news item create an empty list, news['ticker_list'] = list(), and append every matching ticker to it:
import re
tickersList = [('ATI', 'Allegheny rporated', 'Allegheny Technologies Incorporated'), ('ATIS', 'Attis', 'Attis Industries, Inc.'), ('ATKR', 'Atkore International Group', 'Atkore International Group Inc.'), ('ATMP', 'Barclays + Select M', 'Barclays ETN+ Select MLP'), ('ATNM', 'Actinium', 'Actinium Pharmaceuticals, Inc.'), ('ATNX', 'Athenex', 'Athenex, Inc.'), ('ATOS', 'Atossa Genetics', 'Atossa Genetics Inc.'), ('ATRA', 'Atara Biotherapeutics', 'Atara Biotherapeutics, Inc.'), ('ATRC', 'AtriCure', 'AtriCure, Inc.'), ('ATRO', 'Astronics', 'Astronics Corporation'), ('ATRS', 'Antares Pharma', 'Antares Pharma, Inc.'), ('ATSG', 'Air Transport Services Group', 'Air Transport Services Group, Inc.'), ('CJ', 'C&J Energy', 'C&J Energy Services, Inc.'), ('CJJD', 'China Jo-Jo Drugstores', 'China Jo-Jo Drugstores, Inc.'), ('CLAR', 'Clarus', 'Clarus Corporation'), ('CLD', 'Cloud Peak Energy', 'Cloud Peak Energy Inc.'), ('CLDC', 'China Lending', 'China Lending Corporation'), ('CLDR', 'Cloudera', 'Cloudera, Inc.')]
newsList = [
    {'title': 'Atara Biotherapeutics Announces Planned Chief Executive Officer Transition'},
    {'title': 'Chongqing Jingdong Pharmaceutical and Athenex Announce a Strategic Partnership and Licensing Agreement to Develop and Commercialize KX2-391 in China'}
]
for news in newsList:
    news['ticker_list'] = list()
    for ticker, company, company_full in tickersList:
        companyFullFragments = company_full.replace(',', '')\
            .replace('.', '').replace('The ', ' ')\
            .replace('(', ' ').replace(')', ' ')\
            .replace('  ', ' ').strip().split()
        for i in range(len(companyFullFragments), 0, -1):
            companyFullFragmentsString = ' '.join(companyFullFragments[:i]).strip()
            lookFor_company = r'(^|\s){0}(\s|$)'.format(companyFullFragmentsString)
            results_company = re.findall(lookFor_company, news['title'])
            if results_company:
                news['ticker_list'].append(ticker)
                # print(ticker, 'found')
                break

print('tickers for news:')
for news in newsList:
    print(news['ticker_list'])
Output will be:
tickers for news:
['ATRA']
['ATNX', 'CJJD', 'CLDC']
