R language learning basics
For loop
my_vector <- c(1, 2, 3, 4, 5)
# Walk the vector by position; `i` is the index of the current element.
for (i in seq_along(my_vector)) {
  my_vector[i] # placeholder: work with the current element here
}
next
(the equivalent of "continue" in other languages) skips the rest of the current iteration and moves on to the next index in the loop
Foreach loop
my_vector <- c("a", "b", "c")
# Iterate over the elements themselves rather than over their positions.
for (elem in my_vector) {
  print(elem)
}
# Output:
# [1] "a"
# [1] "b"
# [1] "c"
Hashtable (uber fast lookup by string)
# Build a string-keyed lookup table: each row of replacementData is stored in
# an environment under the value of its "recr" column.
lookupReplacement <- new.env()
# seq_len() yields an empty sequence for a zero-row table, whereas
# 1:nrow(...) would iterate over c(1, 0) and index rows that do not exist.
for (ridx in seq_len(nrow(replacementData))) {
  key <- as.character(unlist(replacementData[ridx, "recr"]))
  lookupReplacement[[key]] <- replacementData[ridx, ]
}
Normalize all columns (with dynamic name)
Dynamically runs through the columns and applies dplyr mutate_ (with an underscore at the end, because this version allows column names to be passed as strings #awesome)
# Min-max normalise every column except "ID", addressing columns by their
# name held in a string.
normalized <- not_yet_normalized_dataset
for (col in colnames(normalized)) {
  if (col == "ID") next # skip the ID column
  # mutate_() with a pasted expression string is deprecated and breaks on
  # column names that are not valid R identifiers. mutate() with the .data
  # pronoun and `!!col :=` performs the same string-keyed update.
  normalized <- mutate(
    normalized,
    !!col := (.data[[col]] - min(.data[[col]])) /
      (max(.data[[col]]) - min(.data[[col]]))
  )
}
Install caret package with all dependencies
# Install caret and also pull in the packages listed in its "Depends" and
# "Suggests" fields.
install.packages("caret", dependencies = c("Depends", "Suggests"))
and go make some coffee, because it will take some time :-)
Factorize (numerical to categorical) column
# Convert the Pclass column into a factor (categorical) column.
# `[[` always extracts the column as a plain vector; `[, "Pclass"]` on a
# tibble returns a one-column tibble, which factor() cannot handle correctly.
data[["Pclass"]] <- factor(data[["Pclass"]])
Scale 0-1 01 normalize normalization
# Build a min-max scaler from one column of a data frame.
#
# df:     data frame holding the reference column.
# column: name (string) of the column whose minimum and maximum define the
#         scale.
# x:      never evaluated here; kept so existing three-argument calls keep
#         working.
#
# Returns a function of one argument that maps a numeric vector to
# (x - min) / (max - min), using the min and max of the reference column.
scale01 <- function(df, column, x) {
  # na.rm = TRUE: a single NA in the reference column must not turn every
  # scaled value into NA. NA inputs to the returned scaler still map to NA.
  min_ <- min(df[, column], na.rm = TRUE)
  max_ <- max(df[, column], na.rm = TRUE)
  function(x) {
    (x - min_) / (max_ - min_)
  }
}
Usage:
# Add 0-1 scaled copies of Fare and Age; each scaler is built from the
# matching column of d2 and then applied to that same column.
d3 <- d2 %>%
  mutate(
    nFare = scale01(d2, "Fare", Fare)(Fare),
    nAge = scale01(d2, "Age", Age)(Age)
  )
Generate bins and mutate column to identify bin (bucketize / manual histogram)
# Compute equally spaced bin edges covering a numeric vector.
#
# vector: values to cover.
# width:  distance between consecutive edges.
#
# Returns the edges, starting at min(vector) and stepping by `width` up to at
# most max(vector) + width.
generate_bins <- function(vector, width) {
  lower <- min(vector)
  upper <- max(vector)
  seq(from = lower, to = upper + width, by = width)
}
# Build a bin locator for one column of a data frame.
#
# df:     data frame holding the reference column.
# column: name (string) of the column used to generate the bin edges.
# width:  bin width passed on to generate_bins().
#
# Returns a function that maps each element of a numeric vector to the index
# of the first bin edge strictly greater than it (NA when no edge is greater
# or the element is NA).
which_bin <- function(df, column, width) {
  bins_ <- generate_bins(df[, column], width)
  function(x) {
    # vapply() always returns an integer vector (sapply() returns a list for
    # empty input); `[1]` yields NA for an empty match without needing
    # dplyr::first().
    vapply(
      x,
      function(xx) which(bins_ > xx)[1],
      integer(1)
    )
  }
}
Reading & writing files
Write CSV to string (variable)
# Serialise data2 as CSV into the character vector `raw_csv` (one element per
# line) instead of a file, then parse it back into a data frame.
tc <- textConnection("raw_csv", "w")
write.csv(data2, tc, quote = FALSE, row.names = FALSE)
# Closing flushes any pending output into raw_csv and releases the connection.
close(tc)
data2_copy <- read.csv(text = raw_csv, header = TRUE)
dplyr & friends
For data wrangling.
Order by sum of group
# Sample data parsed from an inline CSV string: 12 rows with the columns
# idx, source_id, item_type and total (three item types per source_id).
example_data <- read.csv(text='"idx","source_id","item_type","total"
"1","aa254dfd-1651-48e8-ac0d-81714129f19b","type1",1
"2","aa254dfd-1651-48e8-ac0d-81714129f19b","type2",1
"3","aa254dfd-1651-48e8-ac0d-81714129f19b","type3",1
"4","fc1a3456-2116-4b21-b540-72f8331c4a04","type1",3
"5","fc1a3456-2116-4b21-b540-72f8331c4a04","type2",4
"6","fc1a3456-2116-4b21-b540-72f8331c4a04","type3",4
"7","def9b096-294b-47b9-bf93-f829e5bb46cf","type1",2
"8","def9b096-294b-47b9-bf93-f829e5bb46cf","type2",3
"9","def9b096-294b-47b9-bf93-f829e5bb46cf","type3",3
"10","db58eddb-3e15-4d56-b0e4-8e464c68d549","type1",9
"11","db58eddb-3e15-4d56-b0e4-8e464c68d549","type2",9
"12","db58eddb-3e15-4d56-b0e4-8e464c68d549","type3",100', header=TRUE)
# Order the rows by the total of their source_id group, largest group first.
grouped_by_sum <- example_data %>%
  group_by(source_id) %>%
  mutate(t_sum = sum(total)) %>%
  # Drop the grouping once the per-group sum exists, so the result is a plain
  # table and select() does not have to re-add source_id implicitly.
  ungroup() %>%
  arrange(desc(t_sum)) %>%
  select(source_id, item_type, total)
grouped_by_sum
Transpose columns (from wide format to long format):
a.k.a. unroll, pivot. Input.csv:
"name","ts","tf","tn","tns","perc","perc_s","perc_t"
"Fizz Corp.",149068,5324659,849861,4325730,0.0279957833919505,0.0272333640314908,0.972766635968509
"Communication",145,530136,221510,308481,0.000273514720750902,0.000273439930904558,0.999726560069095
"buzz",8336,2327614,741885,1577393,0.00358134982862279,0.00356856953273829,0.996431430467262
"xd",1747,328034,39548,286739,0.00532566746129974,0.00529745497769732,0.994702545022303
"cpplus",2461,2487068,1495380,989227,0.000989518581719519,0.000988540402622343,0.999011459597378
"CHGW",9962,14634,4443,229,0.680743474101408,0.405025207350789,0.594974792649211
Code:
library(tidyr)
# pivot_longer() supersedes gather(). It moves the perc_s and perc_t columns
# into name/value pairs and keeps the rows of each input record together.
pivot_longer(
  data5,
  cols = c(perc_s, perc_t),
  names_to = "type",
  values_to = "value"
)
Output.csv:
"name","ts","tf","tn","tns","perc","type","value"
"Fizz Corp.",149068,5324659,849861,4325730,0.0279957833919505,"perc_s",0.0272333640314908
"Fizz Corp.",149068,5324659,849861,4325730,0.0279957833919505,"perc_t",0.972766635968509
"Communication",145,530136,221510,308481,0.000273514720750902,"perc_s",0.000273439930904558
"Communication",145,530136,221510,308481,0.000273514720750902,"perc_t",0.999726560069095
"buzz",8336,2327614,741885,1577393,0.00358134982862279,"perc_s",0.00356856953273829
"buzz",8336,2327614,741885,1577393,0.00358134982862279,"perc_t",0.996431430467262
"xd",1747,328034,39548,286739,0.00532566746129974,"perc_s",0.00529745497769732
"xd",1747,328034,39548,286739,0.00532566746129974,"perc_t",0.994702545022303
"cpplus",2461,2487068,1495380,989227,0.000989518581719519,"perc_s",0.000988540402622343
"cpplus",2461,2487068,1495380,989227,0.000989518581719519,"perc_t",0.999011459597378
"CHGW",9962,14634,4443,229,0.680743474101408,"perc_s",0.405025207350789
"CHGW",9962,14634,4443,229,0.680743474101408,"perc_t",0.594974792649211
Drop column in DataFrame
Drops column named yolo
# Keep every column of df except `yolo`.
df <- subset(df, select = -yolo)
filter IN (SQL IN operator)
library(dplyr)
# Keep the rows whose my_column value is one of the listed strings.
data %>%
  filter(my_column %in% c("Fuel", "Fire", "Desire"))
filter NOT IN
library(dplyr)
# Keep the rows whose my_column value is none of the listed strings.
# The parentheses make explicit that `!` negates the whole %in% test.
data %>%
  filter(!(my_column %in% c("Fuel", "Fire", "Desire")))
map column by applying function (map / select function)
# Apply a function to every element of the column. vapply() returns a plain
# numeric vector; lapply() would return a list and turn the column into a
# list-column. (For simple arithmetic, `data$column * 2` does the same.)
data$column <- vapply(data$column, function(x) x * 2, numeric(1))
Sometimes, when you want to operate on strings:
# Same mapping for strings. vapply() keeps the column a character vector
# instead of a list; USE.NAMES = FALSE stops the input strings from being
# attached as element names.
data$column <- vapply(
  as.character(data$column),
  function(x) paste0(x, "_yolo"),
  character(1),
  USE.NAMES = FALSE
)
ggplot2
Legend label
If the aesthetics use different colors, use color="Label". If they use different fills, use fill="Label".
+ labs(color="Metric")
Custom x axis intervals (axis breaks)
+ scale_x_continuous(breaks=seq(0, max(data$column), by=500))
Plot multiple lines on the same chart
It's better to melt the data frame, so that instead of multiple columns with values you have multiple rows with a "category" column. The result looks like a table before a group-by operation. Full example:
-
Input table, with multiple columns:
avg.occurences avg.per.file avg.wo.p95 files median.occurences percentile.95 total type unique 1 1.455900 109.44000 1.198622 100 1 3 10944 names 7517 2 1.500763 88.50000 1.194457 200 1 3 17700 names 11794 3 1.421497 90.32667 1.168665 300 1 3 27098 names 19063 4 1.416198 81.13750 1.170480 400 1 3 32455 names 22917 5 1.382586 86.34800 1.148838 500 1 3 43174 names 31227 6 1.396559 80.08333 1.146464 600 1 3 48050 names 34406
-
Melting the dataframe:
library(reshape2)
# Keep "files" and "type" as identifier columns; every other column is
# stacked into variable/value pairs.
melted <- melt(data, id = c("files", "type"))
Result of this operation is the following:
files type variable value 1 100 names avg.occurences 1.455900 2 200 names avg.occurences 1.500763 3 300 names avg.occurences 1.421497 4 400 names avg.occurences 1.416198 5 500 names avg.occurences 1.382586 6 600 names avg.occurences 1.396559 7 100 names avg.per.file 109.440000 8 200 names avg.per.file 88.500000 9 300 names avg.per.file 90.326667 10 400 names avg.per.file 81.137500 11 500 names avg.per.file 86.348000 12 600 names avg.per.file 80.083333 13 100 names avg.wo.p95 1.198622 14 200 names avg.wo.p95 1.194457 15 300 names avg.wo.p95 1.168665 16 400 names avg.wo.p95 1.170480 17 500 names avg.wo.p95 1.148838 18 600 names avg.wo.p95 1.146464 19 100 names median.occurences 1.000000 20 200 names median.occurences 1.000000 21 300 names median.occurences 1.000000 22 400 names median.occurences 1.000000 23 500 names median.occurences 1.000000 24 600 names median.occurences 1.000000 25 100 names percentile.95 3.000000 26 200 names percentile.95 3.000000 27 300 names percentile.95 3.000000 28 400 names percentile.95 3.000000 29 500 names percentile.95 3.000000 30 600 names percentile.95 3.000000 31 100 names total 10944.000000 32 200 names total 17700.000000 33 300 names total 27098.000000 34 400 names total 32455.000000 35 500 names total 43174.000000 36 600 names total 48050.000000 37 100 names unique 7517.000000 38 200 names unique 11794.000000 39 300 names unique 19063.000000 40 400 names unique 22917.000000 41 500 names unique 31227.000000 42 600 names unique 34406.000000
-
Now this data can easily be plotted on the same graph using ggplot
# One line per metric: after melting, `variable` names the metric and `value`
# holds its number. The y-axis limit is taken from the plotted data (`melted`).
ggplot(melted, aes(x = files)) +
  geom_line(aes(y = value, color = variable)) +
  ylim(min = 0, max = max(melted$value)) +
  xlab("Number of files") +
  labs(color = "Metric") +
  scale_x_continuous(breaks = seq(0, max(data$files), by = 100)) +
  ggtitle("Occurrences of person entities") +
  theme(legend.position = "bottom")
Rotate labels
+ theme(axis.text.x = element_text(angle = 45, hjust = 1))
Flip coords
+ coord_flip()
Sort geom bars by column
# Horizontal bars for patterns with more than 100 occurrences;
# reorder() sorts the bars by their occurrence count.
df %>%
  filter(occurrences > 100) %>%
  ggplot(aes(x = reorder(pattern, occurrences))) +
  geom_bar(aes(y = occurrences), stat = "identity") +
  coord_flip()
Data science
Precision at, recall at, accuracy at
# Precision over the first `at_` rows of the global data frame `data`:
# sum(predicted1_true1) / (sum(predicted1_true1) + sum(predicted1_true0)).
#
# at_: number of leading rows to evaluate.
#
# Returns NaN for at_ = 0 (no rows, so 0 / 0).
precision_at <- function(at_) {
  # seq_len() selects no rows for at_ = 0; 1:at_ would expand to c(1, 0) and
  # silently evaluate row 1.
  top <- data[seq_len(at_), , drop = FALSE]
  hits <- sum(top$predicted1_true1)
  misses <- sum(top$predicted1_true0)
  hits / (hits + misses)
}
# Recall-style ratio over the first `at_` rows of the global data frame
# `data`: sum(predicted1_true1) / sum(class).
#
# at_: number of leading rows to evaluate.
#
# Returns NaN for at_ = 0 (no rows, so 0 / 0).
recall_at <- function(at_) {
  # seq_len() selects no rows for at_ = 0; 1:at_ would expand to c(1, 0) and
  # silently evaluate row 1.
  top <- data[seq_len(at_), , drop = FALSE]
  sum(top$predicted1_true1) / sum(top$class)
}
# Accuracy over the first `at_` rows of the global data frame `data`:
# sum(correct) / number of rows evaluated.
#
# at_: number of leading rows to evaluate.
#
# Returns NaN for at_ = 0 (no rows, so 0 / 0).
accuracy_at <- function(at_) {
  # seq_len() selects no rows for at_ = 0; 1:at_ would expand to c(1, 0) and
  # silently evaluate row 1.
  top <- data[seq_len(at_), , drop = FALSE]
  sum(top$correct) / nrow(top)
}