When fitting a BART model via the bartMachine package, I get the error in the title.
Here is the code and dput:
# Fit a BART classifier through caret, tuning over a small grid with
# 10-fold cross-validation on a parallel backend.
# Assumes caret, bartMachine and doParallel are already attached and that
# `train` holds the predictors in columns 5:9 -- confirm against your session.

# Response must be a factor; with classProbs = TRUE its levels must also be
# valid R names, hence "C0"/"C1".
train$occ <- as.factor(train$occ)
levels(train$occ) <- c("C0", "C1")

# Tuning grid: only num_trees and k vary; alpha, beta and nu are held fixed.
tune_grid <- expand.grid(
  num_trees = c(50, 100, 200),
  k = c(2, 3),
  alpha = 0.95,
  beta = 2,
  nu = 3
)

# 10-fold cross-validation, scored with twoClassSummary (provides ROC).
train_control <- trainControl(
  method = "cv",
  number = 10,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  allowParallel = TRUE
)

# Start the cluster.
cluster <- makeCluster(8)
registerDoParallel(cluster)

# Train the BART model. The cluster is shut down in `finally` so the worker
# processes are released even when train() stops with an error; the error
# itself is still raised after the cleanup.
bart_train <- tryCatch(
  caret::train(
    x = train[, 5:9],
    y = train$occ,
    method = "bartMachine",
    metric = "ROC",
    trControl = train_control,
    tuneGrid = tune_grid
  ),
  finally = {
    # End the cluster and fall back to sequential execution.
    stopCluster(cluster)
    registerDoSEQ()
  }
)
Here is the dput output of the train data frame:
structure(list(x = c(-49.2180036048647, -49.588646107472, -49.4994660957961,
-49.409070720487, -49.5901102247847, -49.408915914575), y = c(-28.8051270000448,
-28.7079195821462, -28.7107590910968, -28.7091665158844, -28.6199868803577,
-28.6218794939721), ua = c("ua_35", "ua_39", "ua_40", "ua_41",
"ua_47", "ua_49"), occ = structure(c(1L, 1L, 1L, 1L, 1L, 1L), levels = c("C0",
"C1"), class = "factor"), PC1 = c(5.45050585867435, 0.971417276490495,
3.04696429464962, 3.49333347532713, -2.1314970593002, 1.60231066244416
), PC2 = c(1.62129971834298, -2.67253316161164, 0.803381999088846,
1.24449786054891, -6.62041787415885, -1.15464748692714), PC3 = c(0.677239125000311,
-0.800550473360275, -0.666475036424968, -0.386198549623231, -0.197769597835757,
-0.445143448591713), PC4 = c(-4.01008804477917, 1.84301040539535,
0.967638087266157, 0.0781875925367184, 0.994999464193385, 1.58963654917174
), PC5 = c(0.785837462504693, 0.0665561890147296, -0.72701888727977,
-0.739597425424334, 0.712329685720884, -0.351229249996707), block = c(1L,
1L, 1L, 1L, 1L, 1L)), row.names = c(10L, 12L, 13L, 14L, 15L,
16L), class = "data.frame")
There are no NAs in my data; I double-checked and also ran previous models with the same data. The issue occurs only with method = "bartMachine".
Related
I would like to run a statistical test across samples, but each sample has only one measurement.
My data frame is the following:
structure(list(Value = c(1.04, 1.48, 0.3, 0.5, 0.66, 0.99, 0.65,
0.62), Samples = c("S1", "S2", "S3", "S4", "S5", "S6", "S7",
"S8"), Concentration = c(20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L
)), class = "data.frame", row.names = c(NA, -8L))
Here is the code I ran:
library(ggplot2)
library(dplyr)
library(combinat)
# ggbarplot() and stat_compare_means() come from ggpubr, which was never
# loaded in the original script.
library(ggpubr)

# Read the measurements (the original call was missing its closing paren).
Data <- read.csv("Stackoverflow_2023-01-29.csv", header = TRUE)

# One bar per sample. The function name and its argument list must be part of
# the same call; splitting them across lines assigned the function itself to p.
p <- ggbarplot(
  Data,
  x = "Samples", y = "Value",
  color = "Samples", fill = "Samples"
)
p

# All unordered pairs of sample names, one pair per column of m.
x <- unique(Data$Samples)
x
m <- combn(x, 2)
m

# Turn each column of m into one list element. seq_len() keeps this safe when
# m has zero columns, where 1:ncol(m) would iterate over c(1, 0).
new_list <- lapply(seq_len(ncol(m)), function(i) m[, i])
new_list

my_comparison <- new_list
my_comparison

# Add the pairwise comparison brackets to the bar plot.
p1 <- p + stat_compare_means(comparisons = my_comparison)
p1
[enter image description here][1]
The plot that I obtained is attached as a JPG.
enter image description here
I would appreciate any help with the statistics. Many thanks.
I need to create a graph like this that has two relationships: continent-country and country-city. I have 3 columns (city, country, continent), but I am not sure how to get them into this graph.
Below is an example of another graph with only two columns, country & city. metro_name is the city.
# Load the lookup tables; the generic "name" column of each entity table is
# renamed to an entity-specific name.
metro = (
    spark.read.csv("metro.csv", header='true')
    .withColumnRenamed("name", "metro_name")
)
country = (
    spark.read.csv("country.csv", header='true')
    .withColumnRenamed("name", "country_name")
)
continent = (
    spark.read.csv("continent.csv", header='true')
    .withColumnRenamed("name", "continent_name")
)

# Link tables between the entities.
metro_country = spark.read.csv("metro_country.csv", header='true')
country_continent = spark.read.csv("country_continent.csv", header='true')

# Vertices are the country names; each edge goes country (src) -> metro (dst).
# NOTE(review): metro names appear only as edge destinations, not as vertices.
mc_vertices = country.select(col("country_name").alias("id"))
mc_edges = (
    country
    .join(metro_country, country.country_id == metro_country.country_id)
    .join(metro, metro_country.metro_id == metro.metro_id)
    .select(
        col("country_name").alias("src"),
        col("metro_name").alias("dst"),
    )
)
mc = GraphFrame(mc_vertices, mc_edges)

# display graph
import networkx as nx

mc_gp = nx.from_pandas_edgelist(mc.edges.toPandas(), source='src', target='dst')
nx.draw(
    mc_gp,
    with_labels=True,
    node_size=12,
    font_size=12,
    edge_color="red",
)
I have tried:
# gets graph that has country, city, continent
# Each row of mcc_edges is (src=country, dst=city, continent_name).
# NOTE(review): cc_edges is not defined in the visible code; from its use here
# it is assumed to hold src=country, dst=continent -- confirm.
mcc_vertices = mc_vertices
mcc_edges = (
    mc_edges
    .join(cc_edges, mc_edges.src == cc_edges.src)
    .select(
        mc_edges["src"],
        mc_edges["dst"],
        cc_edges["dst"].alias("continent_name"),
    )
)
mcc = GraphFrame(mcc_vertices, mcc_edges)

# display the graph
# from_pandas_edgelist takes (df, source, target, edge_attr). The original call
# passed 'continent_name', 'src', 'dst', which made 'dst' an edge attribute, so
# the cities never became nodes. Build both relationships explicitly instead.
mcc_pdf = mcc.edges.toPandas()
# country -> city edges
mcc_gp = nx.from_pandas_edgelist(mcc_pdf, 'src', 'dst')
# continent -> country edges (repeated pairs collapse into a single edge)
mcc_gp.add_edges_from(zip(mcc_pdf['continent_name'], mcc_pdf['src']))
nx.draw(mcc_gp, with_labels = True, node_size = 12, font_size = 12, edge_color = "red")
# gets graph that only has "North America"
# Vertices are the distinct countries of the continent; edges are the
# matching rows of mcc_edges (country -> city plus the continent_name column).
northamerica_vertices = (
    mcc_edges
    .filter(mcc_edges.continent_name == "North America")
    .select(col("src").alias("id"))
    .distinct()
)
northamerica_edges = mcc_edges.filter(
    mcc_edges.continent_name == "North America"
)
northamerica = GraphFrame(northamerica_vertices, northamerica_edges)

# Draw the country -> city edges with networkx.
northamerica_gp = nx.from_pandas_edgelist(
    northamerica.edges.toPandas(),
    source='src',
    target='dst',
)
nx.draw(
    northamerica_gp,
    with_labels=True,
    node_size=40,
    font_size=10,
    edge_color="red",
)
I used the SMOTE and Tomek methods to handle the imbalanced classes in my data, and I'm trying to fit a boosted regression tree.
It runs smoothly until I create the confusion matrix, where I get this error:
Error: data and reference should be factors with the same levels.
### SMOTE and Tomek
# Read the data and make a 70/30 split on the response `japon`.
NOAA_SMOTE <- read.csv("NOAA_SMOTE.csv", header = TRUE, sep = ",")
train.index <- createDataPartition(NOAA_SMOTE$japon, p = .7, list = FALSE)
train <- NOAA_SMOTE[train.index, ]
test <- NOAA_SMOTE[-train.index, ]

# Tomek links: the response is column 1, the predictors are everything else.
tomek <- ubTomek(train[, -1], train[, 1])

# Cleaned training set as returned by ubTomek, with the response first and
# named "japon". The original cbind(X, Y) put Y last and then renamed column 1,
# which relabelled a predictor instead of the response.
model_train_tomek <- cbind(japon = tomek$Y, tomek$X)

removed.index <- tomek$id.rm
train$japon <- as.factor(train$japon)

# Drop the rows flagged by Tomek. train[-integer(0), ] would select zero rows,
# so keep the full training set when nothing was flagged.
train_tomek <- if (length(removed.index) > 0) {
  train[-removed.index, ]
} else {
  train
}

## SMOTE after tomek links
traintomeksmote <- SMOTE(
  japon ~ ., train_tomek,
  perc.over = 2000, perc.under = 100
)
# Resampling: 10-fold CV repeated 3 times, with class probabilities so that
# twoClassSummary can report ROC.
fitControlSmoteTomek <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

# Tuning grid for gbm: tree depth x number of trees x learning rate, with the
# minimum node size held at 10.
gbmGridSmoteTomek <- expand.grid(
  interaction.depth = c(3, 4, 5, 6),
  n.trees = seq(50, 1500, by = 50),
  shrinkage = c(0.1, 0.001, 0.75, 0.0001),
  n.minobsinnode = 10
)

# Fit the boosted model, selecting the tuning parameters by ROC.
# make.names() turns the class labels into valid R names, which caret requires
# when classProbs = TRUE.
gbmFitNOAASMOTETomek <- caret::train(
  make.names(japon) ~ .,
  data = traintomeksmote,
  method = "gbm",
  metric = "ROC",
  trControl = fitControlSmoteTomek,
  tuneGrid = gbmGridSmoteTomek,
  # passed through to gbm
  distribution = "bernoulli",
  bag.fraction = 0.5,
  verbose = FALSE
)
test$japon <- as.factor(test$japon)

# Class probabilities (a data frame with one column per class). These cannot be
# given to confusionMatrix(), which needs factors of predicted classes.
PredNOAASMOTETomek <- predict(
  gbmFitNOAASMOTETomek,
  newdata = test, type = "prob"
)

# Predicted class labels as a factor.
PredClassNOAASMOTETomek <- predict(
  gbmFitNOAASMOTETomek,
  newdata = test, type = "raw"
)

# The model was trained on make.names(japon), so the observed labels need the
# same transformation and the same level set as the predictions.
RefNOAASMOTETomek <- factor(
  make.names(as.character(test$japon)),
  levels = levels(PredClassNOAASMOTETomek)
)

cmSMOTETomekNOAA <- confusionMatrix(
  PredClassNOAASMOTETomek,
  RefNOAASMOTETomek,
  mode = "everything"
)
part of the data
[enter image description here](https://i.stack.imgur.com/jPgI9.png)
I’m having issues with the following code (I’ve cut out large pieces but I can add them back in – these seemed like the important parts). In my main code, I set up a plot (“sectionizePlot”), which is a simple variation on another whisker plot.
I’m looking to update it on the fly. In the same script, I’m using a heatmap (“ModifiedgenericHeatMap”), which updates fine.
Any ideas how I might update my whiskered-plot? Updating the ColumnDataSource doesn’t seem to work (which makes sense). I’m guessing that I am running into issues with adding each circle/point individually onto the plot.
One idea would be to clear the plot each time and manually add the points onto the plot, but it would need to be cleared each time, which I’m unsure of how to do.
Any help would be appreciated. I’m just a lowly Scientist trying to utilize Bokeh in Pharma research.
def ModifiedgenericHeatMap(source, maxPct):
    """Build a heat-map figure whose cells are coloured and labelled by "count".

    Args:
        source: data source providing the "cols", "rows" and "count" columns
            referenced by the glyphs below.
        maxPct: upper bound of the colour scale (the lower bound is 0).

    Returns:
        The configured figure, with a colour bar added on the right.

    The axis ranges come from the module-level names ``cols`` and ``rows``,
    which are not defined in this function.
    """
    colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
    #mapper = LinearColorMapper(palette=colors, low=0, high=data['count'].max())
    mapper = LinearColorMapper(palette=colors, low=0, high=maxPct)
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    globalDist = figure(title="derp",
                        x_range=cols, y_range=list(reversed(rows)),
                        x_axis_location="above", plot_width=1000, plot_height=400,
                        tools=TOOLS, toolbar_location='below')

    # Strip grid and axis decoration so only the cells and labels remain.
    globalDist.grid.grid_line_color = None
    globalDist.axis.axis_line_color = None
    globalDist.axis.major_tick_line_color = None
    globalDist.axis.major_label_text_font_size = "5pt"
    globalDist.axis.major_label_standoff = 0
    globalDist.xaxis.major_label_orientation = pi / 3

    # One unit-sized rectangle per (cols, rows) pair, coloured by "count".
    globalDist.rect(x="cols", y="rows", width=1, height=1,
                    source=source,
                    fill_color={'field': 'count', 'transform': mapper},
                    line_color=None)
    color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="5pt",
                         ticker=BasicTicker(desired_num_ticks=len(colors)),
                         # tick labels are printed as integer percentages
                         formatter=PrintfTickFormatter(format="%d%%"),
                         label_standoff=6, border_line_color=None, location=(0, 0))

    # Print the count inside each cell, offset from the cell centre.
    text_props = {"source": source, "text_align": "left", "text_baseline": "middle"}
    x = dodge("cols", -0.4, range=globalDist.x_range)
    r = globalDist.text(x=x, y=dodge("rows", 0.3, range=globalDist.y_range), text="count", **text_props)
    r.glyph.text_font_size = "8pt"
    globalDist.add_layout(color_bar, 'right')

    # Hover tooltips refer to data-source columns with "@name"; the previous
    # "#rows #cols" / "#count" strings were shown literally.
    globalDist.select_one(HoverTool).tooltips = [
        ('Well:', '@rows @cols'),
        ('Count:', '@count'),
    ]
    return globalDist
def sectionizePlot(source, source_error, type, base):
    """Create a whisker plot with one coloured point per section.

    Args:
        source: ColumnDataSource whose "base" column holds the y value of each
            point.
        source_error: ColumnDataSource with "base" (x position), "lower" and
            "upper" columns; it drives the whiskers and gives the points' x.
        type: label used in the title and x-axis; when its last word is "Row"
            the x ticks are relabelled A..P.
        base: sequence whose length sets how many random colours are drawn
            up front.

    Returns:
        The configured figure.

    The points used to be added one by one from the values the sources held at
    creation time, so later changes to the sources never reached the plot.
    They are now drawn by a single glyph backed by an internal data source that
    is rebuilt whenever ``source`` or ``source_error`` changes.
    """
    print("sectionize plot created with typ: " + type)
    colors = [getRandomColor() for _ in range(len(base))]
    title = type + "-wise Intensity Distribution"
    p = figure(plot_width=600, plot_height=300, title=title)
    p.add_layout(
        Whisker(source=source_error, base="base", upper="upper", lower="lower"))

    # Internal source holding one row per point: x, y and its colour.
    points = ColumnDataSource(data=dict(x=[], y=[], color=[]))

    def _sync_points(attr, old, new):
        # Pair x positions with y values; while the two sources are briefly
        # out of step during an update, use only the rows both provide.
        xs = list(source_error.data["base"])
        ys = list(source.data["base"])
        n = min(len(xs), len(ys))
        # Draw extra colours if more points arrive than were planned for, so
        # every index keeps a stable colour across updates.
        while len(colors) < n:
            colors.append(getRandomColor())
        points.data = dict(x=xs[:n], y=ys[:n], color=colors[:n])

    source.on_change("data", _sync_points)
    source_error.on_change("data", _sync_points)
    _sync_points("data", None, None)

    p.circle(x="x", y="y", color="color", source=points)

    p.xaxis.axis_label = type
    p.yaxis.axis_label = "Intensity"
    if type.split()[-1] == "Row":
        print("hit a row")
        # Map tick positions 1..16 to the letters A..P (index 0 is dropped).
        conv = dict(enumerate(list("nABCDEFGHIJKLMNOP")))
        conv.pop(0)
        p.xaxis.major_label_overrides = conv
    p.xaxis.ticker = SingleIntervalTicker(interval=1)
    return p
# Module-level state shared with update(): the latest data set plus the data
# sources and figures that update() refreshes.
famData = dict()
# Heat-map source starts empty; update() fills it.
e1FractSource = ColumnDataSource(dict(count=[], cols=[], rows=[], index=[]))
# Heat-map figure with its colour scale capped at 100.
e1Fract = ModifiedgenericHeatMap(e1FractSource, 100)
# Row-section plot: one source for the point values, one for the whiskers.
rowSectTotSource = ColumnDataSource(data=dict(base=[]))
rowSectTotSource_error = ColumnDataSource(data=dict(base=[], lower=[], upper=[]))
# NOTE(review): rowBase is not defined before this line in the visible code
# (update() only creates a local of that name) -- confirm it is set earlier.
rowSectPlot_tot = sectionizePlot(rowSectTotSource,rowSectTotSource_error, "eSum Row", rowBase)
def update(selected=None):
    """Reload the data and push fresh values into the heat-map and row sources.

    Args:
        selected: unused; defaults to None.

    Rebinds the module-level ``famData`` and ``e1Stack``, replaces the data of
    ``e1FractSource``, ``rowSectTotSource_error`` and ``rowSectTotSource``, and
    changes the title of ``rowSectPlot_tot``.
    """
    global famData, e1Stack

    famData = getFAMData(file_source_dt1, True)

    # e1Sub as a percentage of eSum, reshaped to one (rows, cols, count) record
    # per cell and rounded to one decimal.
    pct = (famData['e1Sub'] / famData['eSum']) * 100
    e1Stack = pct.stack(dropna=False).reset_index()
    e1Stack.columns = ["rows", "cols", "count"]
    e1Stack['count'] = e1Stack['count'].apply(lambda x: round(x, 1))

    e1FractSource.data = dict(
        cols=e1Stack["cols"],
        count=(e1Stack["count"]),
        rows=e1Stack["rows"],
        index=e1Stack.index.values,
        codon=wells,
    )

    # Per-row section values and their whisker bounds; the per-column result
    # is not used here.
    rowData, _colData = sectionize(famData['eSum'], rows, cols)
    rowData_lower, rowData_upper = getLowerUpper(rowData)
    rowBase = list(range(1, 17))

    # Whisker source is replaced first, then the point values.
    rowSectTotSource_error.data = dict(
        base=rowBase,
        lower=rowData_lower,
        upper=rowData_upper,
    )
    rowSectTotSource.data = dict(base=rowData)
    rowSectPlot_tot.title.text = "plot changed in update"
# column() lays out plots and widgets, so it must receive the heat-map figure
# (e1Fract), not the ColumnDataSource that feeds it (e1FractSource).
layout = column(e1Fract, rowSectPlot_tot)

# Populate the sources once before the document is served.
update()
curdoc().add_root(layout)
curdoc().title = "Specs"
print("ok")
I have some data,
# Calibration data (dput output): 8 rows with the columns Nominal, Run (factor),
# PAR, PredLin and PredQuad.
calvarbyruno.1<-structure(list(Nominal = c(1, 3, 6, 10, 30, 50, 150, 250), Run = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1", "2", "3"), class = "factor"),
PAR = c(1.25000000000000e-05, 0.000960333333333333, 0.00205833333333334,
0.00423333333333333, 0.0322333333333334, 0.614433333333334,
1.24333333333333, 1.86333333333333), PredLin = c(-0.0119152187070942,
0.00375925114245899, 0.0272709559167888, 0.0586198956158952,
0.215364594111427, 0.372109292606959, 1.15583278508462, 1.93955627756228
), PredQuad = c(-0.0615895732702735, -0.0501563307416599,
-0.0330831368244257, -0.0104619953693943, 0.100190275883806,
0.20675348710041, 0.6782336426345, 1.04748729725370)), .Names = c("Nominal",
"Run", "PAR", "PredLin", "PredQuad"), row.names = c(NA, 8L), class = "data.frame")
# Exponent applied to Nominal to form the regression weights (Nominal^calweight).
calweight <- -2
for which I've created both a linear and a quadratic lm model
# Weighted least-squares fits of PAR on Nominal, with weights Nominal^calweight.
# The argument is spelled out as `weights`; `weight` only worked through
# partial argument matching.
# Linear calibration model.
callin.1 <- lm(
  PAR ~ Nominal,
  data = calvarbyruno.1,
  weights = Nominal^calweight
)
# Quadratic calibration model.
calquad.1 <- lm(
  PAR ~ Nominal + I(Nominal^2),
  data = calvarbyruno.1,
  weights = Nominal^calweight
)
I can then plot my data values using ggplot2
# Scatter plot of the observations (PAR on x, Nominal on y).
# qplot() is deprecated in current ggplot2; this is the equivalent ggplot() call.
ggplot(calvarbyruno.1, aes(x = PAR, y = Nominal)) +
  geom_point()
But can't work out how to overlay a line representing the two lm objects... Any ideas ?
The easiest option is to use geom_smooth() and let ggplot2 fit the model for you.
# Let ggplot2 fit both weighted models: the default straight line and, in red,
# a second-degree polynomial. coord_flip() then swaps the axes so PAR is drawn
# horizontally.
ggplot(
  data = calvarbyruno.1,
  mapping = aes(x = Nominal, y = PAR, weight = Nominal^calweight)
) +
  geom_smooth(method = "lm") +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    colour = "red"
  ) +
  geom_point() +
  coord_flip()
Or you can create a new dataset with the predicted values.
# Grid of Nominal values spanning the observed range, used to draw the fits.
newdata <- data.frame(Nominal = pretty(calvarbyruno.1$Nominal, 100))
newdata$Linear <- predict(callin.1, newdata = newdata)
newdata$Quadratic <- predict(calquad.1, newdata = newdata)

# library() stops with an error when reshape2 is missing; require() would only
# return FALSE and let melt() fail later with a less helpful message.
library(reshape2)

# Long format: one row per (Nominal, Model) pair, prediction in `value`.
newdata <- melt(newdata, id.vars = "Nominal", variable.name = "Model")

# Observed points plus one predicted line per model.
ggplot(calvarbyruno.1, aes(x = PAR, y = Nominal, weight = Nominal^calweight)) +
  geom_line(data = newdata, aes(x = value, colour = Model)) +
  geom_point()
Earlier I asked a related question and Hadley had this good answer. Using the predict function from that post you can add two columns to your data. One for each model:
# Store each model's fitted values (predict() without newdata) as a new column.
calvarbyruno.1[["calQuad"]] <- predict(calquad.1)
calvarbyruno.1[["callin"]] <- predict(callin.1)
Then it's a matter of plotting the point and adding each model in as a line:
# Observed points in green, quadratic fit in red, linear fit in blue.
# opts() no longer exists in ggplot2; theme() is its replacement.
ggplot() +
  geom_point(data = calvarbyruno.1, aes(PAR, Nominal), colour = "green") +
  geom_line(data = calvarbyruno.1, aes(calQuad, Nominal), colour = "red") +
  geom_line(data = calvarbyruno.1, aes(callin, Nominal), colour = "blue") +
  theme(aspect.ratio = 1)
And that results in this nice picture (yeah the colors could use some work):
(source: cerebralmastication.com)