Hello, could you help me solve this question?
Environment: Anaconda3-4.4.0
import pandas as pd
from sqlalchemy import create_engine
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/mysql?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize = 10000)
counts = [ i['fullURLId'].value_counts() for i in sql]
counts = pd.concat(counts).groupby(level=0).sum()
counts = counts.reset_index()
counts.columns = ['index', 'num']
counts['type'] = counts['index'].str.extract(r'(\d{3})')
counts_ = counts[['type', 'num']].groupby('type').sum()
The code above runs fine, but if I add the line below, Python raises "AttributeError: 'DataFrame' object has no attribute 'sort'":
counts_.sort('num', ascending = False)
Question solved: DataFrame.sort was removed in newer pandas versions. The last line should be counts_.sort_values('num', ascending=False) instead.
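For anyone hitting the same error later, a minimal self-contained sketch of the replacement (the frame here is made up for illustration; only the 'num' column name comes from the code above):
import pandas as pd

counts_ = pd.DataFrame({'num': [120, 45, 300]}, index=['101', '102', '103'])
# DataFrame.sort was removed; sort_values is its replacement
print(counts_.sort_values('num', ascending=False))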
I have a CSV file in which I have tweets with the following column names: File, User, Date 1, month, day, Tweet, Permalink, Retweet count, Likes count, Tweet value, Language, Location.
I want to create a new data frame with tweets from certain cities. I can do it, but only for the last city on my list (Girona), so the output doesn't include all the rows. Here is my code:
import pandas as pd
import os
path_to_file = "populismo_merge.csv"
df = pd.read_csv(path_to_file, encoding='utf-8', sep=',')
values = df[df['Location'].str.contains("A Coruña",na=False)]
values = df[df['Location'].str.contains("Alava",na=False)]
values = df[df['Location'].str.contains("Albacete",na=False)]
values = df[df['Location'].str.contains("Alicante",na=False)]
values = df[df['Location'].str.contains("Almería",na=False)]
values = df[df['Location'].str.contains("Asturias",na=False)]
values = df[df['Location'].str.contains("Avila",na=False)]
values = df[df['Location'].str.contains("Badajoz",na=False)]
values = df[df['Location'].str.contains("Barcelona",na=False)]
values = df[df['Location'].str.contains("Burgos",na=False)]
values = df[df['Location'].str.contains("Cáceres",na=False)]
values = df[df['Location'].str.contains("Cádiz",na=False)]
values = df[df['Location'].str.contains("Cantabria",na=False)]
values = df[df['Location'].str.contains("Castellón",na=False)]
values = df[df['Location'].str.contains("Ceuta",na=False)]
values = df[df['Location'].str.contains("Ciudad Real",na=False)]
values = df[df['Location'].str.contains("Córdoba",na=False)]
values = df[df['Location'].str.contains("Cuenca",na=False)]
values = df[df['Location'].str.contains("Formentera",na=False)]
values = df[df['Location'].str.contains("Girona",na=False)]
values.to_csv(r'populismo_ciudad.csv', index = False)
Many thanks!!!
Use isin:
import pandas as pd
import os
path_to_file = "populismo_merge.csv"
df = pd.read_csv(path_to_file, encoding='utf-8', sep=',')
cities = ['A Coruña', 'Alava', 'Albacete', 'Alicante', 'Almería',
'Asturias', 'Avila', 'Badajoz', 'Barcelona', 'Burgos',
'Cáceres', 'Cádiz', 'Cantabria', 'Castellón', 'Ceuta',
'Ciudad Real', 'Córdoba', 'Cuenca', 'Formentera', 'Girona']
values = df[df['Location'].isin(cities)]
values.to_csv(r'populismo_ciudad.csv', index = False)
You are overwriting the values variable each time. A more concise answer would be along the lines of:
values = df[df['Location'].isin(["A Coruña", "Alava", ...])]
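One hedged caveat on the isin approach: isin matches the whole Location string exactly, while the original str.contains kept partial matches (for example a Location like "Barcelona, Spain"). If substring behaviour is what you need, the same city list can be joined into one regex:
# cities is the same list as in the answer above
pattern = '|'.join(cities)
values = df[df['Location'].str.contains(pattern, na=False)]
values.to_csv(r'populismo_ciudad.csv', index=False)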
I am facing an error while concatenating the data from multiple pages and exporting it to a single CSV file. My code exports the data up to page 10, but after page 10 it stops working.
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
url = 'http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Buldhana'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
d = webdriver.Chrome(executable_path=chrome_path)
d.implicitly_wait(10)
d.get(url)
Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlTaluka')).select_by_value('7')
Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlVillage')).select_by_value('1464')
tableElement = d.find_element_by_id(
'ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')
table = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
#print(table)
table.columns = table.iloc[0]
table = table.iloc[1:]
#print(type(table))
table = table[table.Select == 'SurveyNo']
#print(table) #assumption SurveyNo exists for all wanted rows
surveyNo_scripts = [item.get_attribute('href') for item in d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
#print(surveyNo_scripts)
i = 0
for script in surveyNo_scripts:
    d.execute_script(script)
    surveys = d.find_element_by_css_selector('textarea').text
    table.iloc[[i], table.columns.get_loc('Select')] = surveys
    i += 1
print(table)
j = 2
while True:
    # stop when there is no link for the next page number
    if len(d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(j))) > 0:
        d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(j))[0].click()
        tableElement = d.find_element_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate")
        table1 = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
        table1.columns = table1.iloc[0]
        table1 = table1.iloc[1:]
        table1 = table1[table1.Select == 'SurveyNo']  # assumption: SurveyNo exists for all wanted rows
        surveyNo_scripts = [item.get_attribute('href') for item in d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
        i = 0
        for script in surveyNo_scripts:
            d.execute_script(script)
            surveys = d.find_element_by_css_selector('textarea').text
            table1.iloc[[i], table1.columns.get_loc('Select')] = surveys
            i += 1
        table1.columns = table.columns
        table = pd.concat([table, table1], ignore_index=True)
        print(table)
        j += 1
    else:
        break
table.to_csv(r"C:\Users\Guest\Desktop\Sample_buldhana.csv", sep=',', encoding='utf-8-sig', index=False)
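No answer was captured for this question, but one hedged observation: the loop clicks a pager link and immediately re-reads the grid, so if the ASP.NET postback has not finished, the stale page can be read again. Whether this explains the page-10 cutoff is an assumption; still, waiting for the old table element to go stale between click and read is the standard Selenium pattern (same element IDs as above):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

old_table = d.find_element_by_id('ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')
d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(j))[0].click()
# block until the postback replaces the old grid, then re-locate it
WebDriverWait(d, 10).until(EC.staleness_of(old_table))
tableElement = d.find_element_by_id('ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')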
I am currently writing a machine learning program for school to predict the weather. I have been using this article https://stackabuse.com/using-machine-learning-to-predict-the-weather-part-1/ as my main resource (I have had to adjust it, as Wunderground is no longer free, so I have been using OpenWeatherMap instead). While writing the data collection and organization part of my code, I received the following error: AttributeError: 'datetime.datetime' object has no attribute 'striftime'. Sorry in advance for the massive block of code; I figured it would be the best way to troubleshoot the problem. Thank you for any help. The parts wrapped in ** are what I am struggling with.
from datetime import datetime
from datetime import timedelta
import time
from collections import namedtuple
import pandas as pd
import requests
import matplotlib.pyplot as plt
#Data collection and Organization
url = 'http://history.openweathermap.org//storage/d12a3df743e650ba4035d2c6d42fb68f.json'
#res = requests.get(url)
#data = res.json()
target_date = datetime(2018, 4, 22)
features = ["date", "temperature", "pressure", "humidity", "maxtemperature", "mintemperature"]
DailySummary = namedtuple("DailySummary", features)
def extra_weather_data(url, target_date, days):
    for _ in range(days):
        **request = url.format(target_date.striftime('%Y%m%d'))**
        response = requests.get(request)
        if response.status_code == 200:
            data = response.json()
            records.append(DailySummary(
                date = target_date,
                temperature = data['main']['temp'],
                pressure = data['main']['pressure'],
                humidity = data['main']['humidity'],
                maxtemperature = data['main']['temp_max'],
                mintemperature = data['main']['temp_min']))
        time.sleep(6)
        target_date += timedelta(days=1)
**records = extra_weather_data(url, target_date, 365)**
#Finished data collection now begin to clean and process data using Pandas
df = pd.DataFrame(records, columns=features).set_index('date')
tmp = df[['temperature','pressure','humidty', 'maxtemperature', 'mintemperature']].head(10)
def derive_nth_day_feature(df, feature, N):
    rows = df.shape[0]
    nth_prior_measurements = [None]*N + [df[feature][i-N] for i in range(N, rows)]
    col_name = "{}_{}".format(feature, N)
    df[col_name] = nth_prior_measurements
for feature in features:
    if feature != 'date':
        for N in range(1, 4):
            derive_nth_day_feature(df, feature, N)
df.columns
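No answer thread was recorded here, but the traceback itself is the clue: the datetime method is strftime ("string format time"), not striftime, so the first starred line is a typo. A minimal check of the call the question's code meant to make:
from datetime import datetime

target_date = datetime(2018, 4, 22)
print(target_date.strftime('%Y%m%d'))  # prints 20180422; note strftime, not striftime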
Python 3.6
Bokeh 0.12.15
I have tried to implement the Bokeh example line_on_off.py, but in a for loop, with a hover tool, and with data of varying length. What happens, though, is that turning a line off disables the tooltip of every line created after it. For example, if I turn off line 1, the tooltips for lines 2, 3 and 4 are disabled; if I turn off line 3, line 4's tooltip is disabled.
Can I use a hover tool and a checkbox widget in a for loop like this? I have seen the multiline example, but my data is of varying length and I do not want to resample, because I would like to see if there is bad or missing data.
Code
from bokeh.plotting import figure
from bokeh.models import CheckboxGroup, CustomJS
from bokeh.models import ColumnDataSource
import pandas as pd
from bokeh.models import HoverTool
def create_plot(df_list):
    p = figure(x_axis_type = 'datetime')
    glyph_dict = {}
    labels = []
    active = []
    items = []
    names = 'abcdefghijklmnopqrstuvwxyz'
    callback_string = '{}.visible = {} in checkbox.active;'
    code_string = ''
    i = 0
    sources = []
    for df in df_list:
        legend = df.columns[0]
        series = df.iloc[:, 0]
        labels.append(legend)
        x = series.index
        y = series.values
        source = ColumnDataSource(data = {'x': x, 'y': y, 'date': [str(x) for x in x]})
        sources.append(source)
        line = p.line('x', 'y', source = sources[i])
        items.append((legend, [line]))
        name = names[i]
        line.name = name
        code_string += callback_string.format(name, str(i))
        glyph_dict.update({name: line})
        active.append(i)
        i += 1
    hover = HoverTool(tooltips=[('date', '@date'), ('y', '@y')])
    p.add_tools(hover)
    checkbox = CheckboxGroup(labels=labels, active=active, width=200)
    glyph_dict.update({'checkbox': checkbox})
    checkbox.callback = CustomJS.from_coffeescript(args=glyph_dict, code=code_string)
    return checkbox, p
Minimal example
import numpy as np
from datetime import datetime, timedelta
from bokeh.layouts import row
from bokeh.plotting import show
df_list = []
start = datetime(2017, 4,1)
end = datetime(2017,5,1)
for i in range(1,5):
    date = pd.date_range(start, end, freq = '1w')
    shape = len(date)
    df = pd.DataFrame(index = date, data = np.random.randn(shape, 1))
    name = 'df' + str(i)
    df.columns = [name]
    end = end + timedelta(weeks = 1)
    df_list.append(df)
c,p = create_plot(df_list)
r=row([c,p])
show(r)
In a situation like this, you should probably create a new, separate hover tool for each line, by restricting the renderers property of each hover tool. So, in relation to your code, move the hover tool creation inside the loop, and have it set renderers each time:
line = p.line('x', 'y', source = sources[i])
hover = HoverTool(tooltips=[('date', '@date'), ('y', '@y')],
                  renderers=[line])
p.add_tools(hover)
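With a separate HoverTool per line, each tooltip inspects only the renderer in its own renderers list, so hiding one glyph via the checkbox no longer disables the tooltips of lines created after it.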
from lxml import html
import operator
import discord
import yaml
import csv
import json      # needed for json.loads below
import requests  # needed for the Bittrex request below
raw_json = requests.get('https://bittrex.com/api/v1.1/public/getmarketsummaries').text
json_dict = json.loads(raw_json)
stuff = json_dict["result"]
new = []
for i in range(0, 197):
    price = stuff[i]['Last']
    name1 = stuff[i]['MarketName']
    name = name1.replace("BTC-", "")
    prev = stuff[i]['PrevDay']
    diff = price - prev
    change = round(((price - prev) / price) * 100, 2)
    final = '{0},{1}'.format(name, change)
    new.append(final)
butFirst = new[0:]
this1 = ("\n".join(butFirst))
text_file = open("Sort.txt", "w")
text_file.write(this1)
text_file.close()
I'm having problems sorting this output by the second column. I get base-10 errors, integer errors, etc. I think the problem is how the number is stored, but I can't figure it out. The output looks like this:
1ST,-5.94
2GIVE,3.45
ABY,2.44
ADA,0.0
ADT,-4.87
ADX,-13.09
AEON,-2.86
AGRS,-2.0
You should avoid converting your data to text earlier than you need to. If you operate on a list of dictionaries, it's very easy to sort the list.
import json
import csv
import requests
raw_json = requests.get('https://bittrex.com/api/v1.1/public/getmarketsummaries').text
json_dict = json.loads(raw_json)
stuff = json_dict["result"]
new = []
for i in range(0, 197):
    price = float(stuff[i]['Last'])
    prev = float(stuff[i]['PrevDay'])
    # Use a dictionary to hold the data
    d = {
        'name': stuff[i]['MarketName'].replace("BTC-", ""),
        'change': round(((price - prev) / price) * 100, 2)
    }
    new.append(d)
# The actual sorting part, sorting by change
sorted_list = sorted(new, key=lambda k: k['change'])
# Writing the dictionaries to file
with open("Sort.txt", "w") as text_file:
    dict_writer = csv.DictWriter(text_file, sorted_list[0].keys())
    # include the line below if you want headers
    # dict_writer.writeheader()
    dict_writer.writerows(sorted_list)
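One usage note on the sorted() call: it sorts ascending by default, so the biggest negative changes come first. Passing reverse=True puts the largest gainers on top; a tiny sketch with made-up rows:
rows = [{'name': 'ABY', 'change': 2.44}, {'name': 'ADX', 'change': -13.09}]
print(sorted(rows, key=lambda k: k['change'], reverse=True)[0]['name'])  # ABY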