I'm very new to python and got an assignment asking me to:
Design your own code in the "do something here" part to save the title, id, share count,
and comment count of each news media site in separate columns of an Excel (.xls) file.
Design your own code to read the share count and comment count from the Excel
file created in step 3, and calculate the average share count and comment count of
those news media websites.
Here is my current code:
from urllib import request
import json
from pprint import pprint
import xlwt
'''
import xlrd
from xlutils import copy
'''
website_list = [
    'http://www.huffingtonpost.com/',
    'http://www.cnn.com/',
    'https://www.nytimes.com/',
    'http://www.foxnews.com/',
    'http://www.nbcnews.com/'
]  # place your list of website urls, e.g., http://jmu.edu
for website in website_list:
    url_str = 'https://graph.facebook.com/' + website  # create the url for the facebook graph api
    response = request.urlopen(url_str)  # read the response from the server
    html_str = response.read().decode("utf-8")  # convert the response into a string
    json_data = json.loads(html_str)  # convert the string into json
    pprint(json_data)
book = xlwt.Workbook()
sheet_test = book.add_sheet('keys')
sheet_test.write(0,0,'Title')
sheet_test.write(0,1,'ID')
sheet_test.write(0,2,'Share Count')
sheet_test.write(0,3,'Comment Count')
for i in range(0,5):
    for website in website_list[i]:
        sheet_test.write(i,0,json_data['og_object']['title'])
        sheet_test.write(i,1,json_data['id'])
        sheet_test.write(i,2,json_data['share']['share_count'])
        sheet_test.write(i,3,json_data['share']['comment_count'])
book.save('C:\\Users\\stinesr\\Downloads\\Excel\\keys.xls')
'''
reading_book = xlrd.open_workbook('C:\\Users\\stinesr\\Downloads\\Excel\\key.xls')
sheet_read = reading_book.sheet_by_name('keys')
num_record = sheet_read.nrows
writing_book = copy(reading_book)
sheet_write = writing_book.get_sheet(0)
print(sheet_write.name)
for i in range(num_record):
    row = sheet_read.row_values(i)
    if i == 0:
        sheet_write.write(0,4,'Share Count Average')
        sheet_write.write(0,5,'Comment Count Average')
    else:
        sheet_write.write(i,4,row[2])
        sheet_write.write(i,5,row[3])
writing_book.save('C:\\Users\\stinesr\\Downloads\\Excel\\keys.xls')
'''
Any and all help is appreciated, thank you.
The traceback points at the nested for-loops on lines 40-45: you are attempting to overwrite row 0, which the preceding lines already filled with the header. You need to start from row 1 instead.
But before that, note that json_data only keeps the last response; you'll want to create a list of responses and append each response to it.
You then need only one for-loop at line 40:
In summary:
website_list = [
    'http://www.huffingtonpost.com/',
    'http://www.cnn.com/',
    'https://www.nytimes.com/',
    'http://www.foxnews.com/',
    'http://www.nbcnews.com/'
]  # place your list of website urls, e.g., http://jmu.edu
json_list = []
for website in website_list:
    url_str = 'https://graph.facebook.com/' + website  # create the url for the facebook graph api
    response = request.urlopen(url_str)  # read the response from the server
    html_str = response.read().decode("utf-8")  # convert the response into a string
    json_data = json.loads(html_str)  # convert the string into json
    json_list.append(json_data)
pprint(json_list)
book = xlwt.Workbook()
sheet_test = book.add_sheet('keys')
sheet_test.write(0,0,'Title')
sheet_test.write(0,1,'ID')
sheet_test.write(0,2,'Share Count')
sheet_test.write(0,3,'Comment Count')
for i in range(len(json_list)):
sheet_test.write(i+1, 0, json_list[i]['og_object']['title'])
sheet_test.write(i+1, 1, json_list[i]['id'])
sheet_test.write(i+1, 2, json_list[i]['share']['share_count'])
sheet_test.write(i+1, 3, json_list[i]['share']['comment_count'])
book.save('C:\\Users\\stinesr\\Downloads\\Excel\\keys.xls')
This should give you an Excel document with a header row and one row per website.
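For step 4 (the averages), here is a minimal sketch using xlrd, assuming the file saved above and the same sheet name and column layout:

import xlrd

reading_book = xlrd.open_workbook('C:\\Users\\stinesr\\Downloads\\Excel\\keys.xls')
sheet_read = reading_book.sheet_by_name('keys')

share_counts, comment_counts = [], []
for i in range(1, sheet_read.nrows):  # row 0 is the header, so start at 1
    row = sheet_read.row_values(i)
    share_counts.append(row[2])    # 'Share Count' column
    comment_counts.append(row[3])  # 'Comment Count' column

print('Average share count:', sum(share_counts) / len(share_counts))
print('Average comment count:', sum(comment_counts) / len(comment_counts))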
I have written code to extract the numbers and the company names from an extracted PDF file.
Sample PDF content:
#88876 - Sample1, GTRHEUSKYTH, -99WED,-0098B
#99945 - SAMPLE2, DJWHVDFWHEF, -8876D,-3445G
The above example is what my PDF file contains. I want to extract the App number that follows the # (i.e. the five digits, 88876) and the App name that follows the hyphen (i.e. Sample1), and write them out as two separate columns, App_number and App_name.
Please refer to the code I have tried below.
import PyPDF2, re
import csv
for k in range(1,100):
    pdfObj = open(r"C:\Users\merge.pdf", 'rb')
    object = PyPDF2.PdfFileReader(r"C:\Users\merge.pdf")
    pdfReader = PyPDF2.PdfFileReader(pdfObj)
    NumPages = object.getNumPages()
    pdfReader.numPages
    for i in range(0, NumPages):
        pdfPageObj = pdfReader.getPage(i)
        text = pdfPageObj.extractText()
        x = re.findall('(?<=#).[0-9]+', text)
        y = re.findall("(?<=\- )(.*?)(?=,)", text)
        print(x)
        print(y)
with open("out.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(x)
Please share your suggestions.
Try this:
text = '#88876 - Sample1, GTRHEUSKYTH'
App_number = re.search('(?<=#).[0-9]+', text).group()
App_name = re.search("(?<=\- )(.*?)(?=,)", text).group()
In the first regex you get the first run of consecutive digits after the #; in the second you get everything between the - and the ,.
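Putting that together with your page loop, here is a sketch that writes both columns to out.csv. It assumes one record per line in the extracted text, keeps your legacy PyPDF2 calls, and drops the stray . from your first pattern so all five digits are captured:

import PyPDF2, re, csv

pdfObj = open(r"C:\Users\merge.pdf", 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfObj)

with open("out.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["App_number", "App_name"])  # header row
    for i in range(pdfReader.numPages):
        text = pdfReader.getPage(i).extractText()
        for line in text.splitlines():
            number = re.search('(?<=#)[0-9]+', line)
            name = re.search('(?<=- )(.*?)(?=,)', line)
            if number and name:  # skip lines that don't look like records
                writer.writerow([number.group(), name.group()])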
Hope it helped
I am trying to scrape data from a webpage and save the scraped text in JSON format.
I have reached the step where I can gather the text I want, but I can't save it in the expected format. CSV or TXT format would also be sufficient.
Please help me save the scraped text as JSON. Here is the code I have so far:
for k in range(0, len(op3)):
    selectweek.select_by_index(k)
    table = driver.find_element_by_xpath("//table[@class='list-table']")
    for row in table.find_elements_by_xpath('//*[@id="dvFixtureInner"]/table/tbody/tr[2]/td[6]/a'):
        row.click()
        mainpage = driver.window_handles[0]
        print(mainpage)
        popup = driver.window_handles[1]
        driver.switch_to.window(popup)
        time.sleep(3)
        # Meta details of match
        team1 = driver.find_element_by_xpath('//*[@id="match-details"]/div/div[1]/div/div[2]/div[1]/div[1]/a')  # Data to save
        team2 = driver.find_element_by_xpath('//*[@id="match-details"]/div/div[1]/div/div[2]/div[3]/div[1]/a')  # Data to save
        ht = driver.find_element_by_xpath('//*[@id="dvHTScoreText"]')  # Data to save
        ft = driver.find_element_by_xpath('//*[@id="dvScoreText"]')  # Data to save
Create a dictionary and convert it to JSON using the json module. Note that team1, team2, ht and ft are Selenium WebElements, so take their .text before serializing:
import json

dictionary = {"team1": team1.text, "team2": team2.text, "ht": ht.text, "ft": ft.text}
json_dump = json.dumps(dictionary)
with open("YourFilePath", "w") as f:
    f.write(json_dump)
You can create a dictionary and add key-value pairs to it. I don't know the exact structure of the JSON you want, but this can give an idea:
json_data = dict()
ht = 1
ft = 2
json_data["team1"] = {"ht": ht, "ft": ft}
print(json_data)
>>> {'team1': {'ht': 1, 'ft': 2}}
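If you want one record per match across the whole loop, append a dictionary on each iteration and dump the list once at the end. A sketch, assuming the team1/team2/ht/ft elements from your code (the matches.json path is just an example):

import json

matches = []  # accumulates one dict per popup window

# ... inside your loop, after locating the elements:
matches.append({
    "team1": team1.text,  # .text extracts the string from the WebElement
    "team2": team2.text,
    "ht": ht.text,
    "ft": ft.text,
})

# ... after the loop finishes:
with open("matches.json", "w") as f:
    json.dump(matches, f, indent=2)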
I am new to Python, but I am wondering if I can click on the links I get in texts from a particular number. These texts are potential leads for my business, and I want to know if I can automatically click the link in each text, which would secure the lead. Thanks in advance!
What I understand from your question is that you receive a text message and want to follow the links that text contains.
First, extract the links from the text (for example, the tokens starting with "http" and ending with "com", "net", etc.).
You can use regular expressions for that, as in the following example:
import re

text = 'https://first.com then this one https://second.com'
liste = re.findall('https(.+?)com', text)
your_list = list()
for e in liste:
    your_list.append('https' + e + 'com')
print(your_list)
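That pattern only catches links ending in "com"; a slightly more general sketch that grabs any http or https URL up to the next whitespace:

import re

text = 'https://first.com then this one https://second.net'
urls = re.findall(r'https?://\S+', text)  # scheme, then everything up to whitespace
print(urls)  # ['https://first.com', 'https://second.net']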
The links are then stored as strings in your_list.
For each link in the list, send an HTTP request from your program and read back the HTML or JSON result, depending on the link; that is where you can run the lead-securing task you want.
You can send an HTTP request using GET or POST as follows.
First you'll have to install the requests package:
pip install requests
As an example, here is a request to an API that returns the gender of a given name:
import requests
url = "https://gender-api.com/get?name=elizabeth"
r = requests.get(url=url) # using GET
data = r.json()
print(data) # returns the json result
From here you can run whatever operation you want on the result.
Here is a complete example:
import re

text = 'This text contains a link that I wanna check manually ' \
       'https://gender-api.com/get?name=elizabeth I dont know ' \
       'where it is'
liste = re.findall('https(.+?)elizabeth', text)
your_list = list()
for e in liste:
    your_list.append('https' + e + 'elizabeth')

import requests

for url in your_list:
    r = requests.get(url=url)
    data = r.json()
    print(data)
Output
{'name': 'elizabeth', 'errno': 40, 'errmsg': 'invalid or missing key', 'gender': 'unknown', 'samples': 0, 'accuracy': 0, 'duration': '32ms', 'credits_used': 1}
I'm loading a JSON file into a list, then converting it to a dictionary in order to load it into the pygal worldmap. When I print the dictionary the data looks OK (to me), but the map opens (from an svg file) with no data plotted.
No traceback errors are occurring.
I'm not running the latest version of pygal.
Sample output from the dictionary:
{'AF': 9733784, 'AL': 1437590, 'DZ': 92215683, 'AO': 17394550, 'AG': 19061, 'AR': 0}
Code below:
import json
import pygal

# Load the data into a list.
filename = 'test.json'
with open(filename, 'rb') as f:
    sr_data = json.load(f)

# Build a dict of country code -> exposure from the sr_data rows.
sr_exp = {}
for sr_dict in sr_data:
    country_code = sr_dict['CountryCode']
    gross = int(float(sr_dict['Exposed']))
    if country_code:
        sr_exp[country_code] = gross

# Create map.
wm = pygal.Worldmap()
wm.title = 'SR Data'
wm.add('', sr_exp)
wm.render_to_file('sr.svg')
The country codes used as keys in your dictionary need to be in lower case. The simplest fix would be to change the line
sr_exp[country_code] = gross
to
sr_exp[country_code.lower()] = gross
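Applied to the loading loop from the question, the fixed version looks like this; pygal's Worldmap matches keys such as 'af' and 'dz', not 'AF' and 'DZ':

sr_exp = {}
for sr_dict in sr_data:
    country_code = sr_dict['CountryCode']
    gross = int(float(sr_dict['Exposed']))
    if country_code:
        sr_exp[country_code.lower()] = gross  # lower-case key for pygal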
I've written a script which scrapes the address and phone number of certain shops based on their name and lid. It searches by taking the Name and Lid stored in column A and column B of a csv file. After fetching a result for each search, I expected the parser to put the results in column C and column D respectively. At this point I got stuck: I don't know how to manipulate the third and fourth columns with the reader or writer methods so that the data is placed there. I'm trying with this now:
import csv
import requests
from lxml import html

Names, Lids = [], []
with open("mytu.csv", "r") as f:
    reader = csv.DictReader(f)
    for line in reader:
        Names.append(line["Name"])
        Lids.append(line["Lid"])

with open("mytu.csv", "r") as f:
    reader = csv.DictReader(f)
    for entry in reader:
        Page = "https://www.yellowpages.com/los-angeles-ca/mip/{}-{}".format(entry["Name"].replace(" ", "-"), entry["Lid"])
        response = requests.get(Page)
        tree = html.fromstring(response.text)
        titles = tree.xpath('//article[contains(@class,"business-card")]')
        for title in titles:
            Address = title.xpath('.//p[@class="address"]/span/text()')[0]
            Contact = title.xpath('.//p[@class="phone"]/text()')[0]
            print(Address, Contact)
My csv file currently holds only the Name and Lid columns; the desired output has the scraped Address and Phone appended as two more columns.
You can do it like this: create a fresh output csv file whose header is based on the input csv, with the addition of the two new columns. Each csv row you read arrives as a dictionary, here called entry. Add the new values you've gleaned on the 'net to that dictionary, then write each newly created row out to file.
import csv
import requests
from lxml import html

with open("mytu.csv", "r") as f, open('new_mytu.csv', 'w', newline='') as g:
    reader = csv.DictReader(f)
    newfieldnames = reader.fieldnames + ['Address', 'Phone']
    writer = csv.DictWriter(g, fieldnames=newfieldnames)
    writer.writeheader()
    for entry in reader:
        Page = "https://www.yellowpages.com/los-angeles-ca/mip/{}-{}".format(entry["Name"].replace(" ", "-"), entry["Lid"])
        response = requests.get(Page)
        tree = html.fromstring(response.text)
        titles = tree.xpath('//article[contains(@class,"business-card")]')
        #~ for title in titles:
        title = titles[0]  # take the first matching card instead of looping
        Address = title.xpath('.//p[@class="address"]/span/text()')[0]
        Contact = title.xpath('.//p[@class="phone"]/text()')[0]
        print(Address, Contact)
        new_row = entry  # start from the row we just read
        new_row['Address'] = Address
        new_row['Phone'] = Contact
        writer.writerow(new_row)
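One caveat the code above doesn't handle: if a page comes back with no business card, titles[0] raises an IndexError. A minimal guard, leaving the new columns blank for such rows:

titles = tree.xpath('//article[contains(@class,"business-card")]')
if titles:
    Address = titles[0].xpath('.//p[@class="address"]/span/text()')[0]
    Contact = titles[0].xpath('.//p[@class="phone"]/text()')[0]
else:
    Address = Contact = ''  # nothing scraped for this entry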