Apache Spark - Multiple spark context error - apache-spark

I am getting the Multiple spark context error.
Can anybody help me in resolving this?
If I call parsing.take(1) it runs fine, but it gives the multiple-SparkContext error when I call take(2) or more in the last line of my code.
Any help is much appreciated
from pyspark import SparkConf
from pyspark import SparkContext
# Driver-side context. Note: only ONE SparkContext may be active per JVM;
# the error discussed below comes from creating a second one later.
sc = SparkContext()
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
############ IRIS DataSet ##############
# Load the raw CSV from HDFS; each RDD element is one text line.
iris= sc.textFile("hdfs:///user/edureka/IRIS.csv")
# Random 1:2 split into a test set and a training set.
testset,trainingset = iris.randomSplit([1,2])
import numpy as np
def parse_interaction(line):
    """Turn one CSV record into a numeric feature vector.

    Column 4 (the string species label of the IRIS data) is dropped;
    every remaining field is converted to float.
    """
    fields = line.split(",")
    string_columns = [4]  # columns holding non-numeric (string) values
    numeric_fields = [
        value for column, value in enumerate(fields)
        if column not in string_columns
    ]
    return np.array([float(value) for value in numeric_fields])
def parse_interaction_label(line):
    """Extract the label column (index 4) of one CSV record as a float array.

    Assumes the label field is numeric (e.g. an already-encoded class id);
    a raw string label such as "Iris-setosa" would make float() raise.
    """
    fields = line.split(",")
    label_columns = [4]  # the column holding the class label
    picked = [value for column, value in enumerate(fields) if column in label_columns]
    return np.array([float(value) for value in picked])
# Feature-only and label-only RDDs for each split (lazy transformations).
features_train = trainingset.map(parse_interaction)
labels_train = trainingset.map(parse_interaction_label)
features_test=testset.map(parse_interaction)
labels_test=testset.map(parse_interaction_label)
def parse_interaction_with_key(line):
    """Convert every field of one CSV record to float (features AND label).

    Unlike parse_interaction, no column is excluded here, so the label
    travels with the feature values in the same array.
    """
    fields = line.split(",")
    return np.array([float(value) for value in fields])
# Full rows (features + label) for both splits.
features_train_label = trainingset.map(parse_interaction_with_key)
features_test_label= testset.map(parse_interaction_with_key)
# Cross product: every (train_row, test_row) pair, for kNN-style distances.
product=features_train_label.cartesian(features_test_label)
import math
def distancecal(line):
    """Score one (train_row, test_row) pair by Euclidean distance.

    The first four columns of each row are the features (hard-coded).
    Returns (str(test_row), train_row_with_distance_appended) so that
    results can later be grouped per test row.
    """
    train_row = line[0]
    test_row = line[1]
    train_features = train_row[0:4]  # hard-coded feature columns
    train_label = train_row[-1]      # kept for parity with original; unused below
    test_features = test_row[0:4]    # hard-coded feature columns
    test_key = str(test_row)
    squared_diffs = [(u - v) ** 2 for u, v in zip(train_features, test_features)]
    distance = math.sqrt(sum(squared_diffs))
    train_row = np.append(train_row, distance)
    return (test_key, train_row)
# Distance of every training row to every test row, keyed by the test row.
training_label_test_score = product.map(distancecal)
# Collect all scored training rows under their test-row key as a list.
keyvalue=training_label_test_score.groupByKey().mapValues(list)
def sortingvalue(l):
    """Sort a list of scored rows by the appended distance (index 5).

    Fix: the original body created a brand-new SparkContext here, but this
    function runs INSIDE an executor-side map() — once per record — which
    violates Spark's "Only one SparkContext may be active per JVM" rule and
    raises the multiple-SparkContext error. The data at this point is a
    plain Python list, so a local sort does the same job with no Spark.

    Parameters:
        l: list of indexable rows (e.g. numpy arrays) whose element at
           index 5 is the distance score appended by distancecal().
    Returns:
        A new list sorted ascending by that score (same as the original's
        sortBy(...).collect() result).
    """
    return sorted(l, key=lambda a: a[5])
def parsekeyvalueforsorting(line):
    """Return the (key, rows) pair with its row list sorted by sortingvalue()."""
    key, rows = line[0], line[1]
    return (key, sortingvalue(rows))
# NOTE: parsekeyvalueforsorting executes on the executors; anything it calls
# must not touch SparkContext — that is what triggers the multiple-context error.
parsing=keyvalue.map(parsekeyvalueforsorting)
print(parsing.take(2))
This is my list is first element is string and the second one is list of array:
[('[ 0.2 1.4 3.4 5.2 0. ]', [array([ 0.2, 1.4, 3. , 4.9, 0. , **0.5**]), array([ 0.2 , 1.3 , 3.2 , 4.7 , 0. ,**0.54772256**]), array([ 0.2 , 1.4 , 3.6 , 5. , 0. ,
0.28284271]), array([ 0.4 , 1.7 , 3.9 , 5.4 , 0. ,
0.64807407]), array([ 0.2 , 1.5 , 3.4 , 5. , 0. ,
0.2236068]), array([ 0.2 , 1.4 , 2.9 , 4.4 , 0. ,
0.94339811]), array([ 0.1 , 1.5 , 3.1 , 4.9 , 0. ,
0.4472136]), array([ 0.2 , 1.5 , 3.7 , 5.4 , 0. ,
0.37416574]), array([ 0.2 , 1.6 , 3.4 , 4.8 , 0. ,
0.4472136]), array([ 0.1 , 1.4 , 3. , 4.8 , 0. ,
0.57445626]), array([ 0.1 , 1.1 , 3. , 4.3 , 0. ,
1.03440804]), array([ 0.4 , 1.5 , 4.4 , 5.7 , 0. ,
1.14017543]), array([ 0.4 , 1.3 , 3.9 , 5.4 , 0. ,
0.58309519]), array([ 0.3 , 1.7 , 3.8 , 5.7 , 0. ,
0.71414284]), array([ 0.3 , 1.5 , 3.8 , 5.1 , 0. ,
0.43588989]), array([ 0.2 , 1.7 , 3.4 , 5.4 , 0. ,
0.36055513]), array([ 0.4 , 1.5 , 3.7 , 5.1 , 0. ,
0.38729833]), array([ 0.2 , 1. , 3.6 , 4.6 , 0. ,
0.74833148]), array([ 0.5 , 1.7 , 3.3 , 5.1 , 0. ,
0.4472136]), array([ 0.2 , 1.9 , 3.4 , 4.8 , 0. ,
0.64031242]), array([ 0.2 , 1.6 , 3. , 5. , 0. ,
0.48989795]), array([ 0.4 , 1.6 , 3.4 , 5. , 0. ,
0.34641016]), array([ 0.2 , 1.5 , 3.5 , 5.2 , 0. ,
0.14142136]), array([ 0.4, 1.5, 3.4, 5.4, 0. , 0.3]), array([ 0.2 , 1.5 , 3.1 , 4.9 , 0. ,
0.43588989]), array([ 0.2 , 1.2 , 3.2 , 5. , 0. ,
0.34641016]), array([ 0.2 , 1.3 , 3.5 , 5.5 , 0. ,
0.33166248]), array([ 0.2 , 1.5 , 3.4 , 5.1 , 0. ,
0.14142136]), array([ 0.3 , 1.3 , 2.3 , 4.5 , 0. ,
1.3114877]), array([ 0.4 , 1.9 , 3.8 , 5.1 , 0. , 0.678233]), array([ 0.3 , 1.4 , 3. , 4.8 , 0. ,
0.57445626]), array([ 0.2 , 1.6 , 3.8 , 5.1 , 0. ,
0.45825757]), array([ 0.2 , 1.4 , 3.2 , 4.6 , 0. ,
0.63245553]), array([ 0.2 , 1.5 , 3.7 , 5.3 , 0. ,
0.33166248]), array([ 0.2 , 1.4 , 3.3 , 5. , 0. ,
0.2236068]), array([ 1.3 , 4. , 2.3 , 5.5 , 1. ,
3.04466747]), array([ 1.5 , 4.6 , 2.8 , 6.5 , 1. ,
3.73898382]), array([ 1.3 , 4.6 , 2.9 , 6.6 , 1. ,
3.69594372]), array([ 1.4 , 3.9 , 2.7 , 5.2 , 1. ,
2.86006993]), array([ 1.5 , 4.2 , 3. , 5.9 , 1. ,
3.19061123]), array([ 1. , 4. , 2.2 , 6. , 1. ,
3.07896086]), array([ 1.3 , 3.6 , 2.9 , 5.6 , 1. ,
2.54165301]), array([ 1.5 , 4.5 , 3. , 5.6 , 1. ,
3.40881211]), array([ 1. , 4.1 , 2.7 , 5.8 , 1. ,
2.96310648]), array([ 1.5 , 4.5 , 2.2 , 6.2 , 1. ,
3.7067506]), array([ 1.3 , 4. , 2.8 , 6.1 , 1. ,
3.02324329]), array([ 1.5 , 4.9 , 2.5 , 6.3 , 1. ,
3.99499687]), array([ 1.2 , 4.7 , 2.8 , 6.1 , 1. ,
3.6138622]), array([ 1.3 , 4.3 , 2.9 , 6.4 , 1. ,
3.36303434]), array([ 1.4 , 4.8 , 2.8 , 6.8 , 1. ,
3.98998747]), array([ 1.7 , 5. , 3. , 6.7 , 1. ,
4.19761837]), array([ 1.5 , 4.5 , 2.9 , 6. , 1. ,
3.49141805]), array([ 1. , 3.5 , 2.6 , 5.7 , 1. ,
2.43721152]), array([ 1.1 , 3.8 , 2.4 , 5.5 , 1. ,
2.7676705]), array([ 1. , 3.7 , 2.4 , 5.5 , 1. ,
2.64952826]), array([ 1.2 , 3.9 , 2.7 , 5.8 , 1. ,
2.84604989]), array([ 1.6 , 5.1 , 2.7 , 6. , 1. ,
4.09633983]), array([ 1.5 , 4.5 , 3. , 5.4 , 1. ,
3.39116499]), array([ 1.3 , 4.4 , 2.3 , 6.3 , 1. ,
3.55387113]), array([ 1.3 , 4. , 2.5 , 5.5 , 1. ,
2.97825452]), array([ 1.2 , 4.4 , 2.6 , 5.5 , 1. ,
3.27566787]), array([ 1. , 3.3 , 2.3 , 5. , 1. ,
2.34520788]), array([ 1.3 , 4.2 , 2.7 , 5.6 , 1. ,
3.1144823]), array([ 1.2 , 4.2 , 3. , 5.7 , 1. ,
3.04138127]), array([ 1.3 , 4.3 , 2.9 , 6.2 , 1. ,
3.2969683]), array([ 2.5 , 6. , 3.3 , 6.3 , 2. ,
5.26022813]), array([ 1.9 , 5.1 , 2.7 , 5.8 , 2. ,
4.17492515]), array([ 2.1 , 5.9 , 3. , 7.1 , 2. ,
5.25642464]), array([ 1.8 , 5.6 , 2.9 , 6.3 , 2. ,
4.65403051]), array([ 2.2 , 5.8 , 3. , 6.5 , 2. ,
5.02095608]), array([ 1.8 , 6.3 , 2.9 , 7.3 , 2. ,
5.5883808]), array([ 1.8 , 5.8 , 2.5 , 6.7 , 2. ,
4.9979996]), array([ 2.5 , 6.1 , 3.6 , 7.2 , 2. ,
5.60535458]), array([ 2. , 5.1 , 3.2 , 6.5 , 2. ,
4.31972221]), array([ 1.9 , 5.3 , 2.7 , 6.4 , 2. ,
4.4754888]), array([ 2.1 , 5.5 , 3. , 6.8 , 2. ,
4.81040539]), array([ 2. , 5. , 2.5 , 5.7 , 2. ,
4.15451562]), array([ 1.8 , 5.5 , 3. , 6.5 , 2. ,
4.60651712]), array([ 2.2 , 6.7 , 3.8 , 7.7 , 2. ,
6.20483682]), array([ 2.3 , 6.9 , 2.6 , 7.7 , 2. ,
6.44592895]), array([ 1.5 , 5. , 2.2 , 6. , 2. ,
4.09023227]), array([ 2. , 4.9 , 2.8 , 5.6 , 2. ,
4.0012498]), array([ 1.8 , 4.9 , 2.7 , 6.3 , 2. ,
4.06324993]), array([ 1.8 , 6. , 3.2 , 7.2 , 2. ,
5.26877595]), array([ 1.6 , 5.8 , 3. , 7.2 , 2. ,
5.04777179]), array([ 2. , 6.4 , 3.8 , 7.9 , 2. ,
5.97411081]), array([ 2.2 , 5.6 , 2.8 , 6.4 , 2. ,
4.84148737]), array([ 1.5 , 5.1 , 2.8 , 6.3 , 2. ,
4.11703777]), array([ 2.3 , 6.1 , 3. , 7.7 , 2. ,
5.7367238]), array([ 2.4 , 5.6 , 3.4 , 6.3 , 2. ,
4.86723741]), array([ 1.8 , 5.5 , 3.1 , 6.4 , 2. ,
4.57165178]), array([ 2.4 , 5.6 , 3.1 , 6.7 , 2. ,
4.98196748]), array([ 2.3 , 5.1 , 3.1 , 6.9 , 2. ,
4.59129611]), array([ 2.3 , 5.9 , 3.2 , 6.8 , 2. ,
5.22111099]), array([ 2.5 , 5.7 , 3.3 , 6.7 , 2. ,
5.10294033]), array([ 2.3 , 5.2 , 3. , 6.7 , 2. ,
4.61085675]), array([ 1.9 , 5. , 2.5 , 6.3 , 2. ,
4.22729228]), array([ 2.3 , 5.4 , 3.4 , 6.2 , 2. ,
4.62709412]), array([ 1.8 , 5.1 , 3. , 5.9 , 2. ,
4.11096096])]), ('[ 0.3 1.4 3.4 4.6 0. ]', [array([ 0.2 , 1.4 , 3. , 4.9 , 0. ,
0.50990195]), array([ 0.2 , 1.3 , 3.2 , 4.7 , 0. ,
0.26457513]), array([ 0.2 , 1.4 , 3.6 , 5. , 0. ,
0.45825757]), array([ 0.4 , 1.7 , 3.9 , 5.4 , 0. ,
0.99498744]), array([ 0.2 , 1.5 , 3.4 , 5. , 0. ,
0.42426407]), array([ 0.2 , 1.4 , 2.9 , 4.4 , 0. ,
0.54772256]), array([ 0.1 , 1.5 , 3.1 , 4.9 , 0. ,
0.47958315]), array([ 0.2 , 1.5 , 3.7 , 5.4 , 0. ,
0.8660254]), array([ 0.2, 1.6, 3.4, 4.8, 0. , 0.3]), array([ 0.1 , 1.4 , 3. , 4.8 , 0. ,
0.48989795]), array([ 0.1 , 1.1 , 3. , 4.3 , 0. ,
0.6164414]), array([ 0.4 , 1.5 , 4.4 , 5.7 , 0. ,
1.49331845]), array([ 0.4 , 1.3 , 3.9 , 5.4 , 0. ,
0.9539392]), array([ 0.3 , 1.7 , 3.8 , 5.7 , 0. ,
1.2083046]), array([ 0.3 , 1.5 , 3.8 , 5.1 , 0. ,
0.64807407]), array([ 0.2 , 1.7 , 3.4 , 5.4 , 0. ,
0.86023253]), array([ 0.4, 1.5, 3.7, 5.1, 0. , 0.6]), array([ 0.2 , 1. , 3.6 , 4.6 , 0. ,
0.45825757]), array([ 0.5 , 1.7 , 3.3 , 5.1 , 0. ,
0.6244998]), array([ 0.2 , 1.9 , 3.4 , 4.8 , 0. ,
0.54772256]), array([ 0.2 , 1.6 , 3. , 5. , 0. ,
0.60827625]), array([ 0.4 , 1.6 , 3.4 , 5. , 0. ,
0.45825757]), array([ 0.2 , 1.5 , 3.5 , 5.2 , 0. ,
0.6244998]), array([ 0.4 , 1.5 , 3.4 , 5.4 , 0. ,
0.81240384]), array([ 0.2 , 1.5 , 3.1 , 4.9 , 0. ,
0.4472136]), array([ 0.2, 1.2, 3.2, 5. , 0. , 0.5]), array([ 0.2 , 1.3 , 3.5 , 5.5 , 0. ,
0.91651514]), array([ 0.2 , 1.5 , 3.4 , 5.1 , 0. ,
0.51961524]), array([ 0.3 , 1.3 , 2.3 , 4.5 , 0. ,
1.10905365]), array([ 0.4 , 1.9 , 3.8 , 5.1 , 0. ,
0.81853528]), array([ 0.3 , 1.4 , 3. , 4.8 , 0. ,
0.4472136]), array([ 0.2 , 1.6 , 3.8 , 5.1 , 0. , 0.678233]), array([ 0.2 , 1.4 , 3.2 , 4.6 , 0. ,
0.2236068]), array([ 0.2 , 1.5 , 3.7 , 5.3 , 0. ,
0.77459667]), array([ 0.2 , 1.4 , 3.3 , 5. , 0. ,
0.42426407]), array([ 1.3 , 4. , 2.3 , 5.5 , 1. ,
3.12729915]), array([ 1.5 , 4.6 , 2.8 , 6.5 , 1. ,
3.95600809]), array([ 1.3 , 4.6 , 2.9 , 6.6 , 1. ,
3.93573373]), array([ 1.4 , 3.9 , 2.7 , 5.2 , 1. ,
2.88270706]), array([ 1.5 , 4.2 , 3. , 5.9 , 1. ,
3.33616546]), array([ 1. , 4. , 2.2 , 6. , 1. ,
3.26343377]), array([ 1.3 , 3.6 , 2.9 , 5.6 , 1. ,
2.66270539]), array([ 1.5 , 4.5 , 3. , 5.6 , 1. ,
3.49428104]), array([ 1. , 4.1 , 2.7 , 5.8 , 1. ,
3.11608729]), array([ 1.5 , 4.5 , 2.2 , 6.2 , 1. ,
3.87943295]), array([ 1.3 , 4. , 2.8 , 6.1 , 1. ,
3.22024844]), array([ 1.5 , 4.9 , 2.5 , 6.3 , 1. ,
4.17013189]), array([ 1.2 , 4.7 , 2.8 , 6.1 , 1. ,
3.78285606]), array([ 1.3 , 4.3 , 2.9 , 6.4 , 1. , 3.591657]), array([ 1.4 , 4.8 , 2.8 , 6.8 , 1. ,
4.23910368]), array([ 1.7 , 5. , 3. , 6.7 , 1. ,
4.41474801]), array([ 1.5 , 4.5 , 2.9 , 6. , 1. ,
3.64142829]), array([ 1. , 3.5 , 2.6 , 5.7 , 1. ,
2.59807621]), array([ 1.1 , 3.8 , 2.4 , 5.5 , 1. ,
2.86530976]), array([ 1. , 3.7 , 2.4 , 5.5 , 1. ,
2.75499546]), array([ 1.2 , 3.9 , 2.7 , 5.8 , 1. ,
2.99833287]), array([ 1.6 , 5.1 , 2.7 , 6. , 1. ,
4.22255847]), array([ 1.5 , 4.5 , 3. , 5.4 , 1. ,
3.4423829]), array([ 1.3 , 4.4 , 2.3 , 6.3 , 1. ,
3.75499667]), array([ 1.3 , 4. , 2.5 , 5.5 , 1. ,
3.06267857]), array([ 1.2 , 4.4 , 2.6 , 5.5 , 1. ,
3.35559235]), array([ 1. , 3.3 , 2.3 , 5. , 1. ,
2.33880311]), array([ 1.3 , 4.2 , 2.7 , 5.6 , 1. ,
3.21403174]), array([ 1.2 , 4.2 , 3. , 5.7 , 1. ,
3.16543836]), array([ 1.3 , 4.3 , 2.9 , 6.2 , 1. ,
3.49571166]), array([ 2.5 , 6. , 3.3 , 6.3 , 2. ,
5.37587202]), array([ 1.9 , 5.1 , 2.7 , 5.8 , 2. ,
4.26380112]), array([ 2.1 , 5.9 , 3. , 7.1 , 2. ,
5.46808925]), array([ 1.8 , 5.6 , 2.9 , 6.3 , 2. ,
4.79895822]), array([ 2.2 , 5.8 , 3. , 6.5 , 2. ,
5.17107339]), array([ 1.8 , 6.3 , 2.9 , 7.3 , 2. ,
5.81377674]), array([ 1.8 , 5.8 , 2.5 , 6.7 , 2. ,
5.17976833]), array([ 2.5 , 6.1 , 3.6 , 7.2 , 2. ,
5.80775344]), array([ 2. , 5.1 , 3.2 , 6.5 , 2. ,
4.49777723]), array([ 1.9 , 5.3 , 2.7 , 6.4 , 2. ,
4.63680925]), array([ 2.1 , 5.5 , 3. , 6.8 , 2. ,
5.0049975]), array([ 2. , 5. , 2.5 , 5.7 , 2. ,
4.22729228]), array([ 1.8 , 5.5 , 3. , 6.5 , 2. ,
4.77807493]), array([ 2.2 , 6.7 , 3.8 , 7.7 , 2. ,
6.43972049]), array([ 2.3 , 6.9 , 2.6 , 7.7 , 2. ,
6.67083203]), array([ 1.5 , 5. , 2.2 , 6. , 2. ,
4.21900462]), array([ 2. , 4.9 , 2.8 , 5.6 , 2. ,
4.0620192]), array([ 1.8 , 4.9 , 2.7 , 6.3 , 2. ,
4.2284749]), array([ 1.8 , 6. , 3.2 , 7.2 , 2. ,
5.49636243]), array([ 1.6 , 5.8 , 3. , 7.2 , 2. ,
5.28866713]), array([ 2. , 6.4 , 3.8 , 7.9 , 2. ,
6.2401923]), array([ 2.2 , 5.6 , 2.8 , 6.4 , 2. ,
4.98497743]), array([ 1.5 , 5.1 , 2.8 , 6.3 , 2. ,
4.28719022]), array([ 2.3 , 6.1 , 3. , 7.7 , 2. ,
5.98832197]), array([ 2.4 , 5.6 , 3.4 , 6.3 , 2. ,
4.9939964]), array([ 1.8 , 5.5 , 3.1 , 6.4 , 2. ,
4.73180727]), array([ 2.4 , 5.6 , 3.1 , 6.7 , 2. ,
5.15266921]), array([ 2.3 , 5.1 , 3.1 , 6.9 , 2. ,
4.80312398]), array([ 2.3 , 5.9 , 3.2 , 6.8 , 2. ,
5.39722151]), array([ 2.5 , 5.7 , 3.3 , 6.7 , 2. ,
5.26782688]), array([ 2.3 , 5.2 , 3. , 6.7 , 2. ,
4.79687398]), array([ 1.9 , 5. , 2.5 , 6.3 , 2. ,
4.38406204]), array([ 2.3 , 5.4 , 3.4 , 6.2 , 2. ,
4.74973683]), array([ 1.8 , 5.1 , 3. , 5.9 , 2. ,
4.21781934])])]
I need to sort the second element of each pair (the list of arrays) based on the bold values — the appended distance score, which is the last element (index 5) of every array.

There are two hidden questions:
Why do I get the error?
How can I solve the problem?
Answer: The reason for this error is:
http://spark.apache.org/docs/latest/programming-guide.html
Only one SparkContext may be active per JVM. You must stop() the
active SparkContext before creating a new one.
The issue is being caused due to fact that INSIDE a map operation of a spark-context parsing=keyvalue.map(parsekeyvalueforsorting)
You are creating spark-context for each line.
This is the function which is called inside the map operation which creates a new spark-context for every line in keyvalue.
def sortingvalue(l):
# BUG (as the answer explains): this function runs inside a map(), i.e. on
# the executors, once per record.
from pyspark import SparkConf
from pyspark import SparkContext
# Creating a second SparkContext here violates "Only one SparkContext may
# be active per JVM" and raises the multiple-context error.
sc1 = SparkContext()
v = sc1.parallelize(l)
vSorted = v.sortBy(lambda a: a[5])
return(vSorted.collect())
Briefly reviewing your code, it seems like the function takes a line, and sort it. It seems like it can be done easily in python without any need for spark.
This conflict spark requirement : "Only one SparkContext may be active per JVM"
How can I solve the problem?
Rewriting this function as pure python should solve your error.
In order to sort a list of arrays using python you can use the python sorted() function with a key that points at the score column — in the question's data the appended score is the last element, i.e. index 5 (note the toy example below sorts on index 4 instead):
sorted(l, key=lambda x: x[5])
Example input:
>>> myin = [([ 0.2,1.4,3.,4.9,0.3,0.5]), ([ 0.2,1.3,3.2,4.7,0.4,0.54772256]),([0.2,1.4,3.6,5.,0.1,0.28284271]), ([0.4,1.7,3.9,5.4,0.7,0.64807407]), ([0.2,1.5,3.4,5.,0.6,0.2236068])]
Result:
>>> sorted(myin,key = lambda x: x[4])
[[0.2, 1.4, 3.6, 5.0, 0.1, 0.28284271], [0.2, 1.4, 3.0, 4.9, 0.3, 0.5], [0.2, 1.3, 3.2, 4.7, 0.4, 0.54772256], [0.2, 1.5
, 3.4, 5.0, 0.6, 0.2236068], [0.4, 1.7, 3.9, 5.4, 0.7, 0.64807407]]
>>>
More info about sorted() in python wiki
Note that your input holds the word "array" which should be removed in order to be used in sorted()

Related

Divide list elements into sublists once they meet a condition

My list starts with a positive number and I want to split it once a number becomes negative. But if the following numbers are also negative, they should be added to the same sublist, so it should look like:
List = [0.3 , 0.5, 0.6, -0.3 , -0.5, 0.6, 0.5 , -0.2 , -0.7 , 0.7 , 0.9 , 0.6 , -0.6]
sublist1 = [0.3 , 0.5, 0.6, -0.3 , -0.5]
sublist2 = [0.6, 0.5 , -0.2 , -0.7]
sublist3 = [0.7 , 0.9 , 0.6 , -0.6]
You can find where the sign (np.sign) changes from negative to positive (diff between sign values equals to 1 - (-1) = 2), and create groups based on that with cumsum and groupby, then apply list to create lists:
# Wrap the list in a Series to use pandas' groupby machinery.
s = pd.Series(List)
# Group id increments wherever the sign jumps from -1 to +1 (diff == 2),
# so each negative-to-positive transition starts a new sublist.
sublist1, sublist2, sublist3 = (
s.groupby(s.apply(np.sign).diff(1).eq(2).cumsum()).apply(list))
print('sublist1 =', sublist1)
print('sublist2 =', sublist2)
print('sublist3 =', sublist3)
Output:
sublist1 = [0.3, 0.5, 0.6, -0.3, -0.5]
sublist2 = [0.6, 0.5, -0.2, -0.7]
sublist3 = [0.7, 0.9, 0.6, -0.6]

Finding the relative proportion for each element of an array against another array

I have two arrays: A and B.
A = np.array([65, 20, 140, 15, 75, 15])
B = np.array([15, 45, 75, 106, 135, 165])
A can be thought of as user input with shape (n,1) where n is arbitrary positive integer.
B can be thought of as fixed spec with shape (m,1) where m is fixed and known.
I'm trying to get the relative proportion of each element of array A with respect to array B. The result would be a matrix M of shape (n, m) as follows:
M
array([[0. , 0.33, 0.67, 0. , 0. , 0. ],
[0.83, 0.17, 0. , 0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0.83, 0.17],
[1. , 0. , 0. , 0. , 0. , 0. ],
[0. , 0. , 1. , 0. , 0. , 0. ],
[0.67, 0.33, 0. , 0. , 0. , 0. ]])
In the first row of M, second and third element are 0.33 and 0.67 because 65 (first element of A) is 33%/ 67% split between 45 and 75 (second and third) element of B.
M[0,1] = (75 - 65) / (75-45)
M[0,2] = 1 - M[0,1]
I've been looking around but there seems to be no core function to do this.
Many thanks,
I believe this could be optimized, but it seems to be working:
def f(ar: np.ndarray, br: np.ndarray):
    """Return an (n, m) matrix of interpolation weights of ar against br.

    For each element of ar lying between two neighbouring values of the
    sorted grid br, the row holds the two complementary linear weights;
    exact grid hits get a single 1, and values clamped at either end of
    br get a 1 in the first/last column.
    """
    out = np.zeros((ar.shape[0], br.shape[0]))
    for value, row in zip(ar, out):
        below = br[br <= value]   # grid points at or below the value
        above = br[br >= value]   # grid points at or above the value
        lo = below[-1]
        hi = above[0]
        if lo == hi:
            # exact hit on a grid point
            row[len(below) - 1] = 1
        elif lo == br[-1]:
            # at/above the top of the grid
            row[-1] = 1
        elif hi == br[0]:
            # at/below the bottom of the grid
            row[0] = 1
        else:
            span = hi - lo
            row[len(below) - 1] = (hi - value) / span
            row[len(below)] = (value - lo) / span
    return out
And the result for your data:
>>> np.around(f(a, b), decimals=2)
array([[0. , 0.33, 0.67, 0. , 0. , 0. ],
[0.83, 0.17, 0. , 0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0.83, 0.17],
[1. , 0. , 0. , 0. , 0. , 0. ],
[0. , 0. , 1. , 0. , 0. , 0. ],
[1. , 0. , 0. , 0. , 0. , 0. ]])
Assuming B is sorted and min(B)<=min(A) and max(A)<max(B):
# Index of the first B element strictly greater than each A value.
idx = B.searchsorted(A,'right')
# Successive differences of the stacked rows give the two gaps
# (B[idx]-A and A-B[idx-1]) for each element of A.
d = np.diff((B[idx],A,B[idx-1]),axis=0)
out = np.zeros((A.size,B.size))
# Scatter the two normalized weights into columns idx-1 and idx of each row.
np.put_along_axis(out,np.add.outer(idx,(-1,0)),(d/d.sum(axis=0)).T,axis=1)
np.round(out,3)
# array([[0. , 0.333, 0.667, 0. , 0. , 0. ],
# [0.833, 0.167, 0. , 0. , 0. , 0. ],
# [0. , 0. , 0. , 0. , 0.833, 0.167],
# [1. , 0. , 0. , 0. , 0. , 0. ],
# [0. , 0. , 1. , 0. , 0. , 0. ],
# [1. , 0. , 0. , 0. , 0. , 0. ]])

Replace columns in a 2D numpy array by columns from another2D array

I have two 2D arrays, I want to create arrays that are copy of the first one and then replace some columns by others from the second one.
M1 = np.array([[1.0, 2.0, 3.0, 1.0, 2.0, 3.0],
[4.0, 5.0, 6.0, 4.0, 5.0, 6.0]])
M2 = np.array([[1.1, 2.1, 3.1, 1.2, 2.2, 3.2],
[4.1, 5.1, 6.1., 4.2, 5.2, 6.2]])
I want to do a loop that can give the following arrays:
M3 = np.array([[1.1, 2.0, 3.0, 1.2, 2.0, 3.0],
[4.1, 5.0, 6.0, 4.2, 5.0, 6.0]])
M4 = np.array([[1.0, 2.1, 3.0, 1.0, 2.2, 3.0],
[4.0, 5.1, 6.0, 4.0, 5.2, 6.0]])
M5 = np.array([[1.0, 2.0, 3.1, 1.0, 2.0, 3.2],
[4.0, 5.0, 6.1, 4.0, 5.0, 6.2]])
You can use np.where:
selector = [1,0,0,1,0,0]
np.where(selector,M2,M1)
# array([[1.1, 2. , 3. , 1.2, 2. , 3. ],
# [4.1, 5. , 6. , 4.2, 5. , 6. ]])
selector = [0,1,0,0,1,0]
np.where(selector,M2,M1)
# array([[1. , 2.1, 3. , 1. , 2.2, 3. ],
# [4. , 5.1, 6. , 4. , 5.2, 6. ]])
etc.
Or in a loop:
M3,M4,M5 = (np.where(s,M2,M1) for s in np.tile(np.identity(3,bool), (1,2)))
M3
# array([[1.1, 2. , 3. , 1.2, 2. , 3. ],
# [4.1, 5. , 6. , 4.2, 5. , 6. ]])
M4
# array([[1. , 2.1, 3. , 1. , 2.2, 3. ],
# [4. , 5.1, 6. , 4. , 5.2, 6. ]])
M5
# array([[1. , 2. , 3.1, 1. , 2. , 3.2],
# [4. , 5. , 6.1, 4. , 5. , 6.2]])
Alternatively, you can copy M1 and then slice in M2. This is more verbose but should be faster:
# Copy M1 and splice in every n-th column of M2, one offset per result.
n = 3
Mj = []
for j in range(n):
Mp = M1.copy()
# replace columns j, j+n, j+2n, ... with the corresponding M2 columns
Mp[:,j::n] = M2[:,j::n]
Mj.append(Mp)
M3,M4,M5 = Mj

Save a scikit-learn Bunch object

How do I save a scikit-learn Bunch object to a single file? Currently, I save it into several numpy files, which is cumbersome:
from sklearn.datasets import fetch_lfw_people
# Save to files
# Persist each Bunch attribute as its own .npy file (the cumbersome way).
faces = fetch_lfw_people(min_faces_per_person=60)
np.save('faces_data.npy', faces.data)
np.save('faces_images.npy', faces.images)
np.save('faces_target.npy', faces.target)
np.save('faces_target_names.npy', faces.target_names)
np.save('faces_descr.npy', faces.DESCR)
# Read the files
# NOTE(review): sklearn.datasets.base was removed in newer scikit-learn
# versions; Bunch now lives in sklearn.utils — confirm the installed version.
from sklearn.datasets.base import Bunch
# Rebuild the Bunch field by field from the saved arrays.
faces = Bunch()
faces['data'] = np.load('faces_data.npy')
faces['images'] = np.load('faces_images.npy')
faces['target'] = np.load('faces_target.npy')
faces['target_names'] = np.load('faces_target_names.npy')
faces['DESCR'] = np.load('faces_descr.npy')
I'm not sure that this will work for all cases but you should be able to save a bunch object as a pickle file.
Example:
from sklearn import datasets
import pickle
# Serialize the whole Bunch to a single file in one shot with pickle.
iris = datasets.load_iris()
with open('iris.pkl', 'wb') as bunch:
pickle.dump(iris, bunch, protocol=pickle.HIGHEST_PROTOCOL)
# Loading it back restores the complete Bunch object unchanged.
with open('iris.pkl', 'rb') as bunch:
df = pickle.load(bunch)
print(df)
Result:
{'data': array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2],
[5.4, 3.9, 1.7, 0.4],
[4.6, 3.4, 1.4, 0.3],
[5. , 3.4, 1.5, 0.2],
[4.4, 2.9, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.1],
[5.4, 3.7, 1.5, 0.2],
[4.8, 3.4, 1.6, 0.2],
[4.8, 3. , 1.4, 0.1],
[4.3, 3. , 1.1, 0.1],
[5.8, 4. , 1.2, 0.2],
[5.7, 4.4, 1.5, 0.4],
[5.4, 3.9, 1.3, 0.4],
[5.1, 3.5, 1.4, 0.3],
[5.7, 3.8, 1.7, 0.3],
[5.1, 3.8, 1.5, 0.3],
[5.4, 3.4, 1.7, 0.2],
[5.1, 3.7, 1.5, 0.4],
[4.6, 3.6, 1. , 0.2],
[5.1, 3.3, 1.7, 0.5],
[4.8, 3.4, 1.9, 0.2],
[5. , 3. , 1.6, 0.2],
[5. , 3.4, 1.6, 0.4],
[5.2, 3.5, 1.5, 0.2],
[5.2, 3.4, 1.4, 0.2],
[4.7, 3.2, 1.6, 0.2],
[4.8, 3.1, 1.6, 0.2],
[5.4, 3.4, 1.5, 0.4],
[5.2, 4.1, 1.5, 0.1],
[5.5, 4.2, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.2],
[5. , 3.2, 1.2, 0.2],
[5.5, 3.5, 1.3, 0.2],
[4.9, 3.6, 1.4, 0.1],
[4.4, 3. , 1.3, 0.2],
[5.1, 3.4, 1.5, 0.2],
[5. , 3.5, 1.3, 0.3],
[4.5, 2.3, 1.3, 0.3],
[4.4, 3.2, 1.3, 0.2],
[5. , 3.5, 1.6, 0.6],
[5.1, 3.8, 1.9, 0.4],
[4.8, 3. , 1.4, 0.3],
[5.1, 3.8, 1.6, 0.2],
[4.6, 3.2, 1.4, 0.2],
[5.3, 3.7, 1.5, 0.2],
[5. , 3.3, 1.4, 0.2],
[7. , 3.2, 4.7, 1.4],
[6.4, 3.2, 4.5, 1.5],
[6.9, 3.1, 4.9, 1.5],
[5.5, 2.3, 4. , 1.3],
[6.5, 2.8, 4.6, 1.5],
[5.7, 2.8, 4.5, 1.3],
[6.3, 3.3, 4.7, 1.6],
[4.9, 2.4, 3.3, 1. ],
[6.6, 2.9, 4.6, 1.3],
[5.2, 2.7, 3.9, 1.4],
[5. , 2. , 3.5, 1. ],
[5.9, 3. , 4.2, 1.5],
[6. , 2.2, 4. , 1. ],
[6.1, 2.9, 4.7, 1.4],
[5.6, 2.9, 3.6, 1.3],
[6.7, 3.1, 4.4, 1.4],
[5.6, 3. , 4.5, 1.5],
[5.8, 2.7, 4.1, 1. ],
[6.2, 2.2, 4.5, 1.5],
[5.6, 2.5, 3.9, 1.1],
[5.9, 3.2, 4.8, 1.8],
[6.1, 2.8, 4. , 1.3],
[6.3, 2.5, 4.9, 1.5],
[6.1, 2.8, 4.7, 1.2],
[6.4, 2.9, 4.3, 1.3],
[6.6, 3. , 4.4, 1.4],
[6.8, 2.8, 4.8, 1.4],
[6.7, 3. , 5. , 1.7],
[6. , 2.9, 4.5, 1.5],
[5.7, 2.6, 3.5, 1. ],
[5.5, 2.4, 3.8, 1.1],
[5.5, 2.4, 3.7, 1. ],
[5.8, 2.7, 3.9, 1.2],
[6. , 2.7, 5.1, 1.6],
[5.4, 3. , 4.5, 1.5],
[6. , 3.4, 4.5, 1.6],
[6.7, 3.1, 4.7, 1.5],
[6.3, 2.3, 4.4, 1.3],
[5.6, 3. , 4.1, 1.3],
[5.5, 2.5, 4. , 1.3],
[5.5, 2.6, 4.4, 1.2],
[6.1, 3. , 4.6, 1.4],
[5.8, 2.6, 4. , 1.2],
[5. , 2.3, 3.3, 1. ],
[5.6, 2.7, 4.2, 1.3],
[5.7, 3. , 4.2, 1.2],
[5.7, 2.9, 4.2, 1.3],
[6.2, 2.9, 4.3, 1.3],
[5.1, 2.5, 3. , 1.1],
[5.7, 2.8, 4.1, 1.3],
[6.3, 3.3, 6. , 2.5],
[5.8, 2.7, 5.1, 1.9],
[7.1, 3. , 5.9, 2.1],
[6.3, 2.9, 5.6, 1.8],
[6.5, 3. , 5.8, 2.2],
[7.6, 3. , 6.6, 2.1],
[4.9, 2.5, 4.5, 1.7],
[7.3, 2.9, 6.3, 1.8],
[6.7, 2.5, 5.8, 1.8],
[7.2, 3.6, 6.1, 2.5],
[6.5, 3.2, 5.1, 2. ],
[6.4, 2.7, 5.3, 1.9],
[6.8, 3. , 5.5, 2.1],
[5.7, 2.5, 5. , 2. ],
[5.8, 2.8, 5.1, 2.4],
[6.4, 3.2, 5.3, 2.3],
[6.5, 3. , 5.5, 1.8],
[7.7, 3.8, 6.7, 2.2],
[7.7, 2.6, 6.9, 2.3],
[6. , 2.2, 5. , 1.5],
[6.9, 3.2, 5.7, 2.3],
[5.6, 2.8, 4.9, 2. ],
[7.7, 2.8, 6.7, 2. ],
[6.3, 2.7, 4.9, 1.8],
[6.7, 3.3, 5.7, 2.1],
[7.2, 3.2, 6. , 1.8],
[6.2, 2.8, 4.8, 1.8],
[6.1, 3. , 4.9, 1.8],
[6.4, 2.8, 5.6, 2.1],
[7.2, 3. , 5.8, 1.6],
[7.4, 2.8, 6.1, 1.9],
[7.9, 3.8, 6.4, 2. ],
[6.4, 2.8, 5.6, 2.2],
[6.3, 2.8, 5.1, 1.5],
[6.1, 2.6, 5.6, 1.4],
[7.7, 3. , 6.1, 2.3],
[6.3, 3.4, 5.6, 2.4],
[6.4, 3.1, 5.5, 1.8],
[6. , 3. , 4.8, 1.8],
[6.9, 3.1, 5.4, 2.1],
[6.7, 3.1, 5.6, 2.4],
[6.9, 3.1, 5.1, 2.3],
[5.8, 2.7, 5.1, 1.9],
[6.8, 3.2, 5.9, 2.3],
[6.7, 3.3, 5.7, 2.5],
[6.7, 3. , 5.2, 2.3],
[6.3, 2.5, 5. , 1.9],
[6.5, 3. , 5.2, 2. ],
[6.2, 3.4, 5.4, 2.3],
[5.9, 3. , 5.1, 1.8]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]), 'frame': None, 'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='<U10'), 'DESCR': '.. _iris_dataset:\n\nIris plants dataset\n--------------------\n\n**Data Set Characteristics:**\n\n :Number of Instances: 150 (50 in each of three classes)\n :Number of Attributes: 4 numeric, predictive attributes and the class\n :Attribute Information:\n - sepal length in cm\n - sepal width in cm\n - petal length in cm\n - petal width in cm\n - class:\n - Iris-Setosa\n - Iris-Versicolour\n - Iris-Virginica\n \n :Summary Statistics:\n\n ============== ==== ==== ======= ===== ====================\n Min Max Mean SD Class Correlation\n ============== ==== ==== ======= ===== ====================\n sepal length: 4.3 7.9 5.84 0.83 0.7826\n sepal width: 2.0 4.4 3.05 0.43 -0.4194\n petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\n petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n ============== ==== ==== ======= ===== ====================\n\n :Missing Attribute Values: None\n :Class Distribution: 33.3% for each of 3 classes.\n :Creator: R.A. Fisher\n :Donor: Michael Marshall (MARSHALL%PLU#io.arc.nasa.gov)\n :Date: July, 1988\n\nThe famous Iris database, first used by Sir R.A. Fisher. The dataset is taken\nfrom Fisher\'s paper. Note that it\'s the same as in R, but not as in the UCI\nMachine Learning Repository, which has two wrong data points.\n\nThis is perhaps the best known database to be found in the\npattern recognition literature. Fisher\'s paper is a classic in the field and\nis referenced frequently to this day. (See Duda & Hart, for example.) The\ndata set contains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant. One class is linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\n.. topic:: References\n\n - Fisher, R.A. 
"The use of multiple measurements in taxonomic problems"\n Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n Mathematical Statistics" (John Wiley, NY, 1950).\n - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.\n (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n Structure and Classification Rule for Recognition in Partially Exposed\n Environments". IEEE Transactions on Pattern Analysis and Machine\n Intelligence, Vol. PAMI-2, No. 1, 67-71.\n - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions\n on Information Theory, May 1972, 431-433.\n - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al"s AUTOCLASS II\n conceptual clustering system finds 3 classes in the data.\n - Many, many more ...', 'feature_names': ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'], 'filename': 'iris.csv', 'data_module': 'sklearn.datasets.data'}

how to subtract each element in a ndarray with each and every element of another ndarray in numpy

for example I have,
a = np.array([[0.4 , 0.87, 0.24, 0.1 ],
[0.6 , 0.93, 0.34, 0.98],
[0.5 , 0.32, 0.09, 0.99],
[0.4 , 0.11, 0.18, 0.65],
[0.5 , 0.98, 0.47, 0.78]])
b = np.array([[0.6 , 0.93 ,0.34 ,0.98],
[0.7 , 0.47 ,0.43, 0.76]])
I want to subtract each element of 'b' from 'a', but without using a for loop. I used a for loop to get the output, but I have many rows and it takes too long. Is there any way of doing this more efficiently in numpy?
I expect the output to be like :
array([[ 0.2 , 0.06, 0.1 , 0.88],
[ 0. , 0. , 0. , 0. ],
[ 0.1 , 0.61, 0.25, -0.01],
[ 0.2 , 0.82, 0.16, 0.33],
[ 0.1 , -0.05, -0.13, 0.2 ],
[ 0.3 , -0.4 , 0.19, 0.66],
[ 0.1 , -0.46, 0.09, -0.22],
[ 0.2 , 0.15, 0.34, -0.23],
[ 0.3 , 0.36, 0.25, 0.11],
[ 0.2 , -0.51, -0.04, -0.02]])
The order will be different from what you have shown, but the values will be the same with the following method:
>>> np.tile(b, (len(a),1)) - np.tile(a,(len(b),1))
array([[ 0.2 , 0.06, 0.1 , 0.88],
[ 0.1 , -0.46, 0.09, -0.22],
[ 0.1 , 0.61, 0.25, -0.01],
[ 0.3 , 0.36, 0.25, 0.11],
[ 0.1 , -0.05, -0.13, 0.2 ],
[ 0.3 , -0.4 , 0.19, 0.66],
[ 0. , 0. , 0. , 0. ],
[ 0.2 , 0.15, 0.34, -0.23],
[ 0.2 , 0.82, 0.16, 0.33],
[ 0.2 , -0.51, -0.04, -0.02]])
Explanation:
What the code does is create repeats of a and b, so that for each row of a, you have each row of b
>>> np.tile(b, (len(a),1))
array([[0.6 , 0.93, 0.34, 0.98],
[0.7 , 0.47, 0.43, 0.76],
[0.6 , 0.93, 0.34, 0.98],
[0.7 , 0.47, 0.43, 0.76],
[0.6 , 0.93, 0.34, 0.98],
[0.7 , 0.47, 0.43, 0.76],
[0.6 , 0.93, 0.34, 0.98],
[0.7 , 0.47, 0.43, 0.76],
[0.6 , 0.93, 0.34, 0.98],
[0.7 , 0.47, 0.43, 0.76]])
>>> np.tile(a, (len(b),1))
array([[0.4 , 0.87, 0.24, 0.1 ],
[0.6 , 0.93, 0.34, 0.98],
[0.5 , 0.32, 0.09, 0.99],
[0.4 , 0.11, 0.18, 0.65],
[0.5 , 0.98, 0.47, 0.78],
[0.4 , 0.87, 0.24, 0.1 ],
[0.6 , 0.93, 0.34, 0.98],
[0.5 , 0.32, 0.09, 0.99],
[0.4 , 0.11, 0.18, 0.65],
[0.5 , 0.98, 0.47, 0.78]])
Then it's just a matter of subtracting.
[EDIT], since you say your order needs to be preserved, you can do:
>>> np.repeat(b, len(a),axis=0) - np.tile(a,(len(b),1))
array([[ 0.2 , 0.06, 0.1 , 0.88],
[ 0. , 0. , 0. , 0. ],
[ 0.1 , 0.61, 0.25, -0.01],
[ 0.2 , 0.82, 0.16, 0.33],
[ 0.1 , -0.05, -0.13, 0.2 ],
[ 0.3 , -0.4 , 0.19, 0.66],
[ 0.1 , -0.46, 0.09, -0.22],
[ 0.2 , 0.15, 0.34, -0.23],
[ 0.3 , 0.36, 0.25, 0.11],
[ 0.2 , -0.51, -0.04, -0.02]])

Resources