As stated above, I have a homework assignment for a Fundamentals of Data Science class. I am filtering out a tower with faulty information and plotting the good tower's data by amplitude and timing.
The issue is with the mean line on my graph. It is supposed to run through the average of my points, but I cannot seem to align it across my X-axis.
My output looks like this: [plot screenshot omitted]
I've tried solutions I found on Stack Overflow, but the best I could come up with was a mean line for the whole graph using:

mplot.plot(np.unique(columnOneF), np.poly1d(np.polyfit(columnOneF, columnTwoF, 1))(np.unique(columnOneF)))
import csv
import matplotlib.pyplot as mplot
import numpy as np

File = open("WhiteSwordfish_ch1.csv")
csv_file = csv.reader(File)

columnOneF = []
columnTwoF = []
columnThreeF = []
MeanAmp = []
Freq = []
TempFreq = []
last = 0

for row in csv_file:                    # Loop grabs all the rows out of the CSV file
    if float(row[2]) == 21.312057:      # Check whether the frequency is from the good tower; if
        Freq.append(row)                # so, grab THE WHOLE ROW and store it in a list

for row in Freq:                        # Loop through only the good tower's data and sort it
    columnOneF.append(float(row[0]))    # into separate lists by type
    columnTwoF.append(float(row[1]))
    columnThreeF.append(float(row[2]))

# Mean line calculation
for i in Freq:
    current = float(i[0])
    if current == last:
        TempFreq.append(float(i[1]))
    else:
        last = current
        MeanAmp.append(np.mean(TempFreq))
        # MeanAmp.insert(int(current), np.mean(TempFreq))
        TempFreq = []

print(MeanAmp)
print(columnOneF)

# Graph One (Filtered Data)
# ****************************************************************************
mplot.title("Filtered Data")
mplot.xlabel("Timing")
mplot.ylabel("Amplitude")
mplot.axis([-100, 800, -1.5, 1.5])
mplot.scatter(columnOneF, columnTwoF, color="red")  # Clean data POINTS
mplot.plot(MeanAmp, color="blue", linestyle="-")    # Mean LINE
# mplot.plot(np.unique(columnOneF), np.poly1d(np.polyfit(columnOneF, columnTwoF, 1))(np.unique(columnOneF)))
mplot.show()  # Displays both graphs
You have passed only MeanAmp to the plot() function, which is interpreted as
plot(y) # plot y using x as index array 0..N-1
If you provide x-coordinates, just as you do for the scatter() call, the lines will be aligned:

mplot.plot(columnOneF, MeanAmp, color="blue", linestyle="-")
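Note one caveat: MeanAmp holds one value per group of rows that share a timing value, while columnOneF holds one entry per row, so the two lists can differ in length. A minimal sketch of building matching x-coordinates, assuming the rows are grouped by timing exactly as in your loop:

uniqueTimes = []
for t in columnOneF:
    if t not in uniqueTimes:  # keep the first occurrence of each timing value, in order
        uniqueTimes.append(t)

n = min(len(uniqueTimes), len(MeanAmp))  # trim in case the last group was never appended
mplot.plot(uniqueTimes[:n], MeanAmp[:n], color="blue", linestyle="-")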
This code is for an MLX90640 infrared thermal camera. It plots a real-time temperature map across 768 (24x32) pixels using a Raspberry Pi that operates at roughly 1 frame per second. It also saves the temperature data to a CSV file, writing one row per second: column "A" holds the time (HH:MM:SS) and the 768 readings fill columns "B" through "ACN". The problem is that the data in the first and last columns are mixed with double quotes and brackets, e.g. column "A" is 18:03:38 "[39.1 and column "ACN" is 36.8]". I used the pop method and the del method to delete the " [ ] " characters, but both raise an index-out-of-range error. Any idea what causes this problem?
import RPi.GPIO as GPIO
import time,board,busio
import numpy as np
import adafruit_mlx90640
import matplotlib.pyplot as plt
from adafruit_blinka import Enum, Lockable, agnostic
import csv
import datetime

i2c = busio.I2C(board.SCL, board.SDA, frequency=800000) # setup I2C for thermal camera
thermal_mapfile = str(datetime.datetime.now().date()) + '_' + str(datetime.datetime.now().time()).replace(':', '.')
thermal_mapfile = thermal_mapfile[:16] # limit thermal file name to 16 characters
print("Thermal cam is ON")
mlx = adafruit_mlx90640.MLX90640(i2c) # begin MLX90640 with I2C comm
mlx.refresh_rate = adafruit_mlx90640.RefreshRate.REFRESH_2_HZ # set refresh rate 2Hz
mlx_shape = (24,32)
print("Initialized")

# setup the figure for plotting
plt.ion() # enables interactive plotting
fig,ax = plt.subplots(figsize=(12,7))
therm1 = ax.imshow(np.zeros(mlx_shape),vmin=0,vmax=60) # start plot with zeros
cbar = fig.colorbar(therm1) # setup colorbar for temps
cbar.set_label('Temperature [$^{\circ}$C]',fontsize=14) # colorbar label

t_array = []
frame = [0] * 768
t1 = time.monotonic()

while True:
    try:
        mlx.getFrame(frame) # read MLX temperatures into frame var
        data_array = (np.reshape(frame,mlx_shape)) # reshape to 24x32
        therm1.set_data(np.fliplr(data_array)) # flip left to right
        therm1.set_clim(vmin=np.min(data_array),vmax=np.max(data_array)) # set bounds
        cbar.update_normal(therm1) # update colorbar range
        plt.title(f"Max Temp: {np.max(data_array):.1f}C")
        plt.pause(0.001) # required
        t_array.append(time.monotonic()-t1)
    except ValueError:
        continue # if error, just read again
    for h in range(24):
        for w in range(32):
            t = frame[h*32 + w]
    frame = list(np.around(np.array(frame),1)) # round array elements to one decimal point
    with open("/home/pi/Thermal_Camera/"+thermal_mapfile+".csv","a") as thermalfile:
        writer = csv.writer(thermalfile,delimiter=" ")
        unix_time = time.time()
        formatted_time = datetime.datetime.fromtimestamp(unix_time).strftime('%H:%M:%S')
        writer.writerow([formatted_time,frame])
An example of what I am talking about:
import csv
import datetime

hdrs = ['dt','a', 'b', 'c']
data_list = [1, 2, 3]

# Case 1, passing a list directly.
with open('csv_list_test.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter='|')
    csv_writer.writerow(hdrs)
    csv_writer.writerow([datetime.datetime.now().isoformat(), data_list])

cat csv_list_test.csv
dt|a|b|c
2023-01-24T17:17:44.961821|[1, 2, 3]

# Case 2, unpack list.
with open('csv_list_test.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter='|')
    csv_writer.writerow(hdrs)
    csv_writer.writerow([datetime.datetime.now().isoformat(), *data_list])

cat csv_list_test.csv
dt|a|b|c
2023-01-24T17:18:32.337160|1|2|3
I use a delimiter that makes it easy to distinguish the columns; delimiter=" " is not a good idea.
In Case 1 you can see that the whole list lands in column a.
In Case 2, unpacking the list (*data_list) puts the individual elements in the appropriate columns.
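Applied to your camera code, the same unpacking fix would look something like this (a sketch, not tested on the Pi; it assumes frame is the rounded 768-element list):

with open("/home/pi/Thermal_Camera/"+thermal_mapfile+".csv","a") as thermalfile:
    writer = csv.writer(thermalfile, delimiter=",")  # a comma keeps the columns unambiguous
    formatted_time = datetime.datetime.fromtimestamp(time.time()).strftime('%H:%M:%S')
    writer.writerow([formatted_time, *frame])        # unpack so each reading gets its own cell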
I have a series of n files that I'd like to read in parallel using mpi4py. Each file contains a column vector and, as the final result, I want to obtain a matrix containing all the single vectors, X = [x1 x2 ... xn].
In the first part of the code I create the list containing all the file names and distribute parts of the list to the different cores through the scatter method.
import numpy as np
import pandas as pd
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nprocs = comm.Get_size()

folder = "data/" # Input directory
files = [] # File List

# Create File List -----------------------------------------------------------
if rank == 0:
    for i in range(1,2000):
        filename = "file_" + str(i) + ".csv"
        files = np.append(files,filename)
    print("filelist complete!")

    # Determine the size of each sub-task
    ave, res = divmod(files.size, nprocs)
    counts = [ave + 1 if p < res else ave for p in range(nprocs)]

    # Determine starting and ending indices of each sub-task
    starts = [sum(counts[:p]) for p in range(nprocs)]
    ends = [sum(counts[:p+1]) for p in range(nprocs)]

    # Convert data into list of arrays
    fileList = [files[starts[p]:ends[p]] for p in range(nprocs)]
else:
    fileList = None

fileList = comm.scatter(fileList, root = 0)
Here I create a matrix X in which to store the vectors.
# Variables Initialization ---------------------------------------------------
# Creation Support Vector
vector = pd.read_csv(folder + fileList[0])
vector = vector.values
vectorLength = len(vector)
# Matrix
X = np.ones((vectorLength, len(fileList)))
# ----------------------------------------------------------------------------
Here I read the different files and append each column vector to the matrix X. With the gather method I collect the X matrices computed by the single cores into one object: the result of the gather is a list of 2D NumPy arrays. As a final step, I reorganize that list into a single matrix.
# Reading Files -----------------------------------------------------------
for i in range(len(fileList)):
    data = pd.read_csv(folder + fileList[i])
    data = np.array(data.values)
    X[:,i] = data[:,0]

X = comm.gather(X, root = 0)

if rank == 0:
    X_tot = np.empty((vectorLength, 1))
    for i in range(nprocs):
        X_proc = np.array(X[i])
        X_tot = np.append(X_tot, X_proc, axis=1)
    X_tot = X_tot[:,1:]
    X = X_tot
    del X_tot
    print("printing X", X)
The code works fine. I tested it on a small dataset and it did what it was meant to do. However, when I tried to run it on a large dataset I got the following error:
X = comm.gather(X[:,1:], root = 0)
File "mpi4py/MPI/Comm.pyx", line 1578, in mpi4py.MPI.Comm.gather
File "mpi4py/MPI/msgpickle.pxi", line 773, in mpi4py.MPI.PyMPI_gather
File "mpi4py/MPI/msgpickle.pxi", line 778, in mpi4py.MPI.PyMPI_gather
File "mpi4py/MPI/msgpickle.pxi", line 191, in mpi4py.MPI.pickle_allocv
File "mpi4py/MPI/msgpickle.pxi", line 182, in mpi4py.MPI.pickle_alloc
SystemError: Negative size passed to PyBytes_FromStringAndSize
It seems a very general error; however, I could process the same data in serial mode without problems, or in parallel as long as I didn't use all n files. I also noticed that only the rank 0 core seems to do any work, while the others seem to do nothing.
This is my first project using mpi4py, so I'm sorry if the code is not perfect and if I have made any conceptual mistake.
This error typically occurs when the data passed between MPI processes exceeds a certain size (I think 2 GB). It's supposed to be fixed in future MPI versions, but for now you'll probably have to resort to a workaround, like storing your data on the hard disk and having each process read it separately...
See for example here: https://github.com/mpi4py/mpi4py/issues/23
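A minimal sketch of that disk-based workaround, assuming each rank has already filled its local X as in the question (file names are illustrative):

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nprocs = comm.Get_size()

# ... each rank fills its local matrix X as above ...
np.save("X_rank%d.npy" % rank, X)  # each rank writes its own block to disk
comm.Barrier()                     # wait until every rank has finished writing

if rank == 0:
    blocks = [np.load("X_rank%d.npy" % p) for p in range(nprocs)]
    X_tot = np.concatenate(blocks, axis=1)  # reassemble X = [x1 x2 ... xn]

This avoids pushing the whole matrix through the pickled gather.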
So, I'm trying to generate some fake random data of a given dimension size. Essentially, I want a dataframe in which the data has a uniform random distribution. The data consist of both continuous and categorical values. I've written the following code, but it doesn't work the way I want it to.
import random
import pandas as pd
import time
from datetime import datetime

# declare global variables
adv_name = ['soft toys', 'kitchenware', 'electronics',
            'mobile phones', 'laptops']
adv_loc = ['location_1', 'location_2', 'location_3',
           'location_4', 'location_5']
adv_prod = ['baby product', 'kitchenware', 'electronics',
            'mobile phones', 'laptops']
adv_size = [1, 2, 3, 4, 10]
adv_layout = ['static', 'dynamic'] # advertisement layout type on website
# adv_date, start_time, end_time = []
num = 10 # the given dimension

# define function to generate random advert locations
def rand_shuf_loc(str_lst, num):
    lst = adv_loc
    # using list comprehension
    rand_shuf_str = [item for item in lst for i in range(num)]
    return(rand_shuf_str)

# define function to generate random advert names
def rand_shuf_prod(loc_list, num):
    rand_shuf_str = [item for item in loc_list for i in range(num)]
    random.shuffle(rand_shuf_str)
    return(rand_shuf_str)

# define function to generate random impression and click data
def rand_clic_impr(num):
    rand_impr_lst = []
    click_lst = []
    for i in range(num):
        rand_impr_lst.append(random.randint(0, 100))
        click_lst.append(random.randint(0, 100))
    return {'rand_impr_lst': rand_impr_lst, 'rand_click_lst': click_lst}

# define function to generate random product price and discount
def rand_prod_price_discount(num):
    prod_price_lst = [] # advertised product price
    prod_discnt_lst = [] # advertised product discount
    for i in range(num):
        prod_price_lst.append(random.randint(10, 100))
        prod_discnt_lst.append(random.randint(10, 100))
    return {'prod_price_lst': prod_price_lst, 'prod_discnt_lst': prod_discnt_lst}

def rand_prod_click_timestamp(stime, etime, num):
    prod_clik_tmstmp = []
    frmt = '%d-%m-%Y %H:%M:%S'
    for i in range(num):
        rtime = int(random.random()*86400)
        hours = int(rtime/3600)
        minutes = int((rtime - hours*3600)/60)
        seconds = rtime - hours*3600 - minutes*60
        time_string = '%02d:%02d:%02d' % (hours, minutes, seconds)
        prod_clik_tmstmp.append(time_string)
    time_stmp = [item for item in prod_clik_tmstmp for i in range(num)]
    return {'prod_clik_tmstmp_lst':time_stmp}

def main():
    print('generating data...')
    # print('generating random geographic coordinates...')
    # get the impressions and click data
    impression = rand_clic_impr(num)
    clicks = rand_clic_impr(num)
    product_price = rand_prod_price_discount(num)
    product_discount = rand_prod_price_discount(num)
    prod_clik_tmstmp = rand_prod_click_timestamp("20-01-2018 13:30:00",
                                                 "23-01-2018 04:50:34", num)
    lst_dict = {"ad_loc": rand_shuf_loc(adv_loc, num),
                "prod": rand_shuf_prod(adv_prod, num),
                "imprsn": impression['rand_impr_lst'],
                "cliks": clicks['rand_click_lst'],
                "prod_price": product_price['prod_price_lst'],
                "prod_discnt": product_discount['prod_discnt_lst'],
                "prod_clik_stmp": prod_clik_tmstmp['prod_clik_tmstmp_lst']}
    fake_data = pd.DataFrame.from_dict(lst_dict, orient="index")
    res = fake_data.apply(lambda x: x.fillna(0)
                          if x.dtype.kind in 'biufc'
                          # where 'biufc' means boolean, integer,
                          # unsigned integer, float & complex dtypes
                          else x.fillna(random.randint(0, 100)))
    print(res.transpose())
    res.to_csv("fake_data.csv", sep=",")

# invoke the main function
if __name__ == "__main__":
    main()
Problem 1
When I execute the above code snippet, it prints fine, but when written to CSV the data is positioned horizontally; i.e., it looks like this... How do I position it vertically when writing to the CSV file? What I want is 7 columns (see the lst_dict variable above) with n rows.
Problem 2
I don't understand why random timestamps are generated for the first 50 columns while the remaining columns are filled with numerical values.
To answer your first question, replace

print(res.transpose())

with

res = res.transpose()
print(res)

so that res is actually transposed before it is written to the CSV file.

To answer your second question, look at the length of the output of rand_shuf_loc(): it, as well as rand_shuf_prod(), produces a list of 50 items, while the numeric helpers produce only num = 10 values, so the lists have different lengths and pandas pads the short ones with NaN.
The creation of res using fake_data.apply then replaces every NaN with a random numeric, so it also applies numerics to the columns without any predefined values.
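A minimal sketch of the column-wise alternative, assuming every field should have exactly num values (column names follow the question's lst_dict; adv_loc and adv_prod are the question's lists):

import random
import pandas as pd

num = 10
adv_loc = ['location_1', 'location_2', 'location_3', 'location_4', 'location_5']
adv_prod = ['baby product', 'kitchenware', 'electronics', 'mobile phones', 'laptops']

# Every list has exactly num entries, so pandas never pads with NaN.
lst_dict = {"ad_loc": [random.choice(adv_loc) for _ in range(num)],
            "prod": [random.choice(adv_prod) for _ in range(num)],
            "imprsn": [random.randint(0, 100) for _ in range(num)],
            "cliks": [random.randint(0, 100) for _ in range(num)],
            "prod_price": [random.randint(10, 100) for _ in range(num)],
            "prod_discnt": [random.randint(10, 100) for _ in range(num)]}

fake_data = pd.DataFrame(lst_dict)             # keys become columns, num rows
fake_data.to_csv("fake_data.csv", index=False) # written vertically, one row per record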
I'm trying to learn Spark so I'm totally new to it.
I have a file with thousands of lines where each one is structured like:
LFPG;EDDW;00;E170;370;LFPG;EDDW;189930555;150907;1826;!!!!;AFR1724;AFR;0;AFR1724-LFPG-EDDW-20150907180000;N;0;;;245382;;;150907;1800;0;;X;;;;;;;;;;370;;0;20150907175700;AA45458743;;;;;NEXE;NEXE;;;;;20150907180000;;;;245382;;;;;;;;;;;;;;;;;;;;;;;;;;;;AFR;;;;;;;;;;;0
The above line represents flight information for an airplane: it took off from LFPG (1st element) and landed at EDDW (2nd element); the rest of the information is not relevant for this purpose.
I'd like to print, or save to a file, the top ten busiest airports based on the total number of aircraft movements, that is, airplanes that took off from or landed at an airport.
So in a sense, the desired output would be:
AIRPORT_NAME #TOTAL_MOVEMENTS #TAKE-OFFs #LANDINGS
I have already implemented this program in Python and would like to rewrite it using the MapReduce paradigm in Spark.
# Libraries
import sys
from collections import Counter
import collections
from itertools import chain
from collections import defaultdict

# START
# Defining default program argument
if len(sys.argv)==1:
    fileName = "airports.exp2"
else:
    fileName = sys.argv[1]

takeOffAirport = []
landingAirport = []

# Reading file
lines = 0 # Counter for file lines
try:
    with open(fileName) as file:
        for line in file:
            words = line.split(';')
            # Relevant data, item1 and item2 from each file line
            origin = words[0]
            destination = words[1]
            # Populating lists
            landingAirport.append(destination)
            takeOffAirport.append(origin)
except IOError:
    print ("\n\033[0;31mIoError: could not open the file:\033[00m %s" %fileName)

airports_dict = defaultdict(list)

# Merge lists into a dictionary key:value
for key, value in chain(Counter(takeOffAirport).items(),
                        Counter(landingAirport).items()):
    # 'AIRPORT_NAME':[num_takeOffs, num_landings]
    airports_dict[key].append(value)

# Sum key values and add it as another value
for key, value in airports_dict.items():
    # 'AIRPORT_NAME':[num_totalMovements, [num_takeOffs, num_landings]]
    airports_dict[key] = [sum(value),value]

# Sort dictionary by the top 10 total movements
airports_dict = sorted(airports_dict.items(),
                       key=lambda kv:kv[1], reverse=True)[:10]
airports_dict = collections.OrderedDict(airports_dict)

# Print results
print("\nAIRPORT"+ "\t\t#TOTAL_MOVEMENTS"+ "\t#TAKEOFFS"+ "\t#LANDINGS")
for k in airports_dict:
    print(k,"\t\t", airports_dict[k][0],
          "\t\t\t", airports_dict[k][1][1],
          "\t\t", airports_dict[k][1][0])
A test file can be downloaded from: https://srv-file7.gofile.io/download/YCnWxr/traffic1day.exp2
So far I've been able to extract the first and second elements from each line, but I don't know quite how to implement the filter or reduce steps to obtain the number of times each airport appears in each list, and then merge both lists, combining the airport name, the sum of takeoffs and landings, and the individual takeoff and landing counts.
from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
    conf = SparkConf().setAppName("airports").setMaster("local[*]")
    sc = SparkContext(conf = conf)

    airports = sc.textFile("traffic1hour.exp2", minPartitions=4)
    airports = airports.map(lambda line : line.split('\n'))

    takeOff_airports = airports.map(lambda sub: (sub[0].split(';')[0]))
    landing_airports = airports.map(lambda sub: (sub[0].split(';')[1]))

    takeOff_airports.saveAsTextFile("takeOff_airports.txt")
    landing_airports.saveAsTextFile("landing_airport.txt")
Any hint or guidance will be much appreciated.
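A hedged sketch of one way to express the counting with (airport, 1) pairs and reduceByKey — the names are illustrative and it assumes the ';'-separated format shown above:

from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("airports").setMaster("local[*]")
sc = SparkContext(conf=conf)

fields = sc.textFile("traffic1hour.exp2", minPartitions=4) \
           .map(lambda line: line.split(';'))

# Count takeoffs per origin airport and landings per destination airport.
takeoffs = fields.map(lambda f: (f[0], 1)).reduceByKey(lambda a, b: a + b)
landings = fields.map(lambda f: (f[1], 1)).reduceByKey(lambda a, b: a + b)

# Full outer join keeps airports that only appear in one of the two lists;
# missing counts come back as None, hence the "or 0".
movements = takeoffs.fullOuterJoin(landings).map(
    lambda kv: (kv[0], ((kv[1][0] or 0) + (kv[1][1] or 0),
                        kv[1][0] or 0,
                        kv[1][1] or 0)))

# Top ten airports by total movements.
for airport, (total, toffs, lands) in movements.takeOrdered(10, key=lambda kv: -kv[1][0]):
    print(airport, total, toffs, lands)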
I'm attempting to create a box plot using Bokeh. When I get to the section where I need to identify outliers, it fails if a given category has no outliers.
If I remove the "problem" category, the box plot renders properly. It's only when I attempt to create the box plot with a category that has no outliers that it fails.
Any instruction on how to remedy this?
The failure occurs at the section commented "Prepare outlier data for plotting [...]".
import numpy as np
import pandas as pd
import datetime
import math
from bokeh.plotting import figure, show, output_file
from bokeh.models import NumeralTickFormatter

# Create time stamps to allow for figure to display span in title
today = datetime.date.today()
delta1 = datetime.timedelta(days=7)
delta2 = datetime.timedelta(days=1)
start = str(today - delta1)
end = str(today - delta2)

# Identify location of prices
itemloc = 'Everywhere'

df = pd.read_excel(r'C:\Users\me\prices.xlsx')

# Create a list from the dataframe that identifies distinct categories for the separate box plots
cats = df['subcategory_desc'].unique().tolist()

# Find the quartiles and IQR for each category
groups = df.groupby('subcategory_desc', sort=False)
q1 = groups.quantile(q=0.25)
q2 = groups.quantile(q=0.5)
q3 = groups.quantile(q=0.75)
iqr = q3 - q1
upper = q3 + 1.5*iqr
lower = q1 - 1.5*iqr

# Find the outliers for each category
def outliers(group):
    cat = group.name
    return group[(group.price > upper.loc[cat][0]) | (group.price < lower.loc[cat][0])]['price']

out = groups.apply(outliers).dropna()

# Prepare outlier data for plotting, we need coordinates for every outlier.
outx = []
outy = []
for cat in cats:
    # only add outliers if they exist
    if not out.loc[cat].empty:
        for value in out[cat]:
            outx.append(cat)
            outy.append(value)
I expect the box-and-whisker portions of categories with no outliers to simply show up without the outlier dots.
Have you tried the code from the official documentation, https://docs.bokeh.org/en/latest/docs/gallery/boxplot.html?
# prepare outlier data for plotting, we need coordinates for every outlier.
if not out.empty:
    outx = []
    outy = []
    for keys in out.index:
        outx.append(keys[0])
        outy.append(out.loc[keys[0]].loc[keys[1]])
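If I read that gallery snippet right, out.index here is a (category, row) MultiIndex produced by groups.apply(outliers), so iterating over it only ever visits categories that actually have outliers; a category with none simply contributes no index entries, and the out.loc[cat] lookup that failed in your version is never attempted.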