Marcin's Developer Notes - notes and pro tips for R

R language learning basics

For loop

my_vector <- c(1, 2, 3, 4, 5)
for (idx in seq_along(my_vector)) {
  my_vector[idx]  # do something with the current element
}

next (the R equivalent of "continue" in other languages) skips the rest of the current iteration and moves on to the next one

Foreach loop

my_vector <- c("a", "b", "c")
for (item in my_vector) {
  print(item)
}

# Output:
# [1] "a"
# [1] "b"
# [1] "c"

Hashtable (uber fast lookup by string)

# Build an environment used as a hash table: each row of replacementData is
# stored under the string value of its "recr" column.
lookupReplacement <- new.env()
# Convert the key column once instead of once per row.
replacement_keys <- as.character(replacementData[["recr"]])
# seq_len() yields an empty sequence for a zero-row data frame, whereas
# 1:nrow() would iterate over c(1, 0).
for (ridx in seq_len(nrow(replacementData))) {
  lookupReplacement[[replacement_keys[ridx]]] <- replacementData[ridx, ]
}

Normalize all columns (with dynamic name)

Dynamically runs through the columns and applies dplyr mutate_ (with an underscore at the end, because this version allows column names to be given as strings #awesome)

# Min-max scale every column except "ID". The scaling expression is built as
# a string and handed to mutate_() under the column's own name, so each
# column is overwritten by its normalized version.
normalized <- not_yet_normalized_dataset
for (column_name in colnames(normalized)) {
  if (column_name != "ID") {
    scaling_expr <- sprintf(
      "(%1$s-min(%1$s))/(max(%1$s)-min(%1$s))",
      column_name
    )
    normalized <- mutate_(
      normalized,
      .dots = setNames(scaling_expr, column_name)
    )
  }
}

Install caret package with all dependencies

# Install caret together with the packages listed in its Depends and
# Suggests fields.
install.packages(
  pkgs = "caret",
  dependencies = c("Depends", "Suggests")
)

and go make some coffee, because it will take some time :-)

Factorize (numerical to categorical) column

# Convert the Pclass column to a factor (categorical) in place.
# `[[` extracts the column as a plain vector for both data.frames and tibbles;
# the previous select()[, "Pclass"] round trip returned a one-column tibble
# when `data` was a tibble, which factor() cannot handle sensibly.
data[["Pclass"]] <- factor(data[["Pclass"]])

Scale 0-1 01 normalize normalization

# Build a min-max scaler for one column of a data frame.
#
# df:     data frame holding the column.
# column: name (string) of the column whose range defines the scaling.
# x:      unused; kept (and now optional) so existing three-argument calls
#         such as scale01(d2, "Fare", Fare) keep working.
#
# Returns a function mapping a numeric vector x to (x - min) / (max - min),
# where min and max come from df[[column]]. NAs in the column are ignored
# when computing the range; previously a single NA made every scaled value NA.
scale01 <- function(df, column, x = NULL) {
  column_range <- range(df[[column]], na.rm = TRUE)
  lower <- column_range[1]
  span <- column_range[2] - lower
  function(x) {
    (x - lower) / span
  }
}

Usage:

# Add normalized copies of Fare and Age: a scaler is built from each column's
# range in d2 and then applied to that same column.
d3 <- d2 %>%
  mutate(
    nFare = scale01(d2, "Fare", Fare)(Fare),
    nAge = scale01(d2, "Age", Age)(Age)
  )

Generate bins and mutate column to identify bin (bucketize / manual histogram)

# Compute bin edges covering the range of a numeric vector.
#
# vector: numeric values to cover; NAs are ignored when finding the range
#         (previously a single NA made seq() fail).
# width:  distance between consecutive edges.
#
# Returns the edges min, min + width, ... up to at most max + width, so the
# last edge is always greater than the largest value.
generate_bins <- function(vector, width) {
  value_range <- range(vector, na.rm = TRUE)
  seq(value_range[1], value_range[2] + width, by = width)
}

# Build a function that assigns values to bins derived from one column.
#
# df:     data frame holding the column.
# column: name (string) of the column whose range defines the bins.
# width:  bin width passed on to generate_bins().
#
# Returns a function taking a numeric vector x and giving, for each element,
# the index of the first bin edge strictly greater than it (NA when no edge
# is greater, or when the element is NA). The result is always an integer
# vector, including for empty input, where sapply() used to return list().
which_bin <- function(df, column, width) {
  bins_ <- generate_bins(df[[column]], width)
  function(x) {
    # findInterval() counts the edges <= x, so the first edge > x is one
    # position further along.
    bin_index <- findInterval(x, bins_) + 1L
    bin_index[which(bin_index > length(bins_))] <- NA_integer_
    bin_index
  }
}

Reading & writing files

Write CSV to string (variable)

# Write data2 as CSV into the character vector `raw_csv` (one element per
# line), then parse that text back into a data frame.
tc <- textConnection("raw_csv", "w")
write.csv(data2, tc, quote = FALSE, row.names = FALSE)
# Closing releases the connection and flushes any incomplete final line
# into raw_csv; it was previously left open.
close(tc)
data2_copy <- read.csv(text = raw_csv, header = TRUE)

dplyr & friends

For data wrangling.

Order by sum of group

# Inline sample data: 12 rows, four distinct source_id values with three
# item types (type1..type3) each, plus a numeric total per row.
example_data <- read.csv(text='"idx","source_id","item_type","total"
"1","aa254dfd-1651-48e8-ac0d-81714129f19b","type1",1
"2","aa254dfd-1651-48e8-ac0d-81714129f19b","type2",1
"3","aa254dfd-1651-48e8-ac0d-81714129f19b","type3",1
"4","fc1a3456-2116-4b21-b540-72f8331c4a04","type1",3
"5","fc1a3456-2116-4b21-b540-72f8331c4a04","type2",4
"6","fc1a3456-2116-4b21-b540-72f8331c4a04","type3",4
"7","def9b096-294b-47b9-bf93-f829e5bb46cf","type1",2
"8","def9b096-294b-47b9-bf93-f829e5bb46cf","type2",3
"9","def9b096-294b-47b9-bf93-f829e5bb46cf","type3",3
"10","db58eddb-3e15-4d56-b0e4-8e464c68d549","type1",9
"11","db58eddb-3e15-4d56-b0e4-8e464c68d549","type2",9
"12","db58eddb-3e15-4d56-b0e4-8e464c68d549","type3",100', header=TRUE)

# Order rows by the total of their source_id group, largest group first.
# t_sum is computed per group, then the grouping is dropped so the result is
# a plain (ungrouped) table. source_id is selected explicitly instead of
# relying on select() silently re-adding the grouping column.
grouped_by_sum <- example_data %>%
  group_by(source_id) %>%
  mutate(t_sum = sum(total)) %>%
  ungroup() %>%
  arrange(desc(t_sum)) %>%
  select(source_id, item_type, total)

grouped_by_sum

Transpose columns (from wide format to long format):

A.k.a. unroll or pivot. Input.csv:

"name","ts","tf","tn","tns","perc","perc_s","perc_t"
"Fizz Corp.",149068,5324659,849861,4325730,0.0279957833919505,0.0272333640314908,0.972766635968509
"Communication",145,530136,221510,308481,0.000273514720750902,0.000273439930904558,0.999726560069095
"buzz",8336,2327614,741885,1577393,0.00358134982862279,0.00356856953273829,0.996431430467262
"xd",1747,328034,39548,286739,0.00532566746129974,0.00529745497769732,0.994702545022303
"cpplus",2461,2487068,1495380,989227,0.000989518581719519,0.000988540402622343,0.999011459597378
"CHGW",9962,14634,4443,229,0.680743474101408,0.405025207350789,0.594974792649211

Code:

library(tidyr)
# pivot_longer() replaces the superseded gather(). The perc_s and perc_t
# columns are stacked: their names go into "type" and their values into
# "value"; every other column is repeated for each new row.
# Note: the result is a tibble, and rows stay grouped per input row
# (perc_s then perc_t for each name).
pivot_longer(
  data5,
  cols = c(perc_s, perc_t),
  names_to = "type",
  values_to = "value"
)

Output.csv:

"name","ts","tf","tn","tns","perc","type","value"
"Fizz Corp.",149068,5324659,849861,4325730,0.0279957833919505,"perc_s",0.0272333640314908
"Fizz Corp.",149068,5324659,849861,4325730,0.0279957833919505,"perc_t",0.972766635968509
"Communication",145,530136,221510,308481,0.000273514720750902,"perc_s",0.000273439930904558
"Communication",145,530136,221510,308481,0.000273514720750902,"perc_t",0.999726560069095
"buzz",8336,2327614,741885,1577393,0.00358134982862279,"perc_s",0.00356856953273829
"buzz",8336,2327614,741885,1577393,0.00358134982862279,"perc_t",0.996431430467262
"xd",1747,328034,39548,286739,0.00532566746129974,"perc_s",0.00529745497769732
"xd",1747,328034,39548,286739,0.00532566746129974,"perc_t",0.994702545022303
"cpplus",2461,2487068,1495380,989227,0.000989518581719519,"perc_s",0.000988540402622343
"cpplus",2461,2487068,1495380,989227,0.000989518581719519,"perc_t",0.999011459597378
"CHGW",9962,14634,4443,229,0.680743474101408,"perc_s",0.405025207350789
"CHGW",9962,14634,4443,229,0.680743474101408,"perc_t",0.594974792649211

Drop column in DataFrame

Drops column named yolo

# Keep every column except yolo.
df <- subset(df, select = -yolo)

filter IN (SQL IN operator)

library(dplyr)
# Keep rows whose my_column equals any of the listed values (SQL: IN).
data %>%
  filter(my_column %in% c("Fuel", "Fire", "Desire"))

filter NOT IN

library(dplyr)
# Keep rows whose my_column matches none of the listed values (SQL: NOT IN).
data %>%
  filter(!(my_column %in% c("Fuel", "Fire", "Desire")))

map column by applying function (map / select function)

# Apply a function to every element of a numeric column.
# vapply() returns a plain numeric vector; lapply() returned a list, which
# silently turned the column into a list column.
# (For simple arithmetic like this, data$column * 2 does the same thing.)
data$column <- vapply(data$column, function(x) x * 2, numeric(1))

Sometimes, when you want to operate on strings:

# Apply a string function to every element of a column.
# vapply() returns a plain character vector instead of the list that lapply()
# produced; USE.NAMES = FALSE stops the input strings being attached as names.
data$column <- vapply(
  as.character(data$column),
  function(x) paste0(x, "_yolo"),
  character(1),
  USE.NAMES = FALSE
)

ggplot2

Legend label

If aesthetics are using different colors, use color="Label". If using different fills, use fill="Label".

+ labs(color="Metric")

Custom x axis intervals (axis breaks)

 + scale_x_continuous(breaks=seq(0, max(data$column), by=500))

Plot multiple lines on the same chart

It's better to melt the data frame, so that instead of multiple columns with values you have multiple rows with a "category" column. It's like a table before a group-by operation. Full example:

Input table, with multiple columns:

avg.occurences avg.per.file avg.wo.p95 files median.occurences percentile.95 total  type unique
1       1.455900    109.44000   1.198622   100                 1             3 10944 names   7517
2       1.500763     88.50000   1.194457   200                 1             3 17700 names  11794
3       1.421497     90.32667   1.168665   300                 1             3 27098 names  19063
4       1.416198     81.13750   1.170480   400                 1             3 32455 names  22917
5       1.382586     86.34800   1.148838   500                 1             3 43174 names  31227
6       1.396559     80.08333   1.146464   600                 1             3 48050 names  34406

Melting the dataframe:

library(reshape2)
# Keep "files" and "type" as identifier columns; every other column is
# stacked into variable/value pairs. The argument is spelled out as id.vars
# because `id` only worked through partial argument matching.
melted <- melt(data, id.vars = c("files", "type"))

Result of this operation is the following:

  files  type          variable        value
1    100 names    avg.occurences     1.455900
2    200 names    avg.occurences     1.500763
3    300 names    avg.occurences     1.421497
4    400 names    avg.occurences     1.416198
5    500 names    avg.occurences     1.382586
6    600 names    avg.occurences     1.396559
7    100 names      avg.per.file   109.440000
8    200 names      avg.per.file    88.500000
9    300 names      avg.per.file    90.326667
10   400 names      avg.per.file    81.137500
11   500 names      avg.per.file    86.348000
12   600 names      avg.per.file    80.083333
13   100 names        avg.wo.p95     1.198622
14   200 names        avg.wo.p95     1.194457
15   300 names        avg.wo.p95     1.168665
16   400 names        avg.wo.p95     1.170480
17   500 names        avg.wo.p95     1.148838
18   600 names        avg.wo.p95     1.146464
19   100 names median.occurences     1.000000
20   200 names median.occurences     1.000000
21   300 names median.occurences     1.000000
22   400 names median.occurences     1.000000
23   500 names median.occurences     1.000000
24   600 names median.occurences     1.000000
25   100 names     percentile.95     3.000000
26   200 names     percentile.95     3.000000
27   300 names     percentile.95     3.000000
28   400 names     percentile.95     3.000000
29   500 names     percentile.95     3.000000
30   600 names     percentile.95     3.000000
31   100 names             total 10944.000000
32   200 names             total 17700.000000
33   300 names             total 27098.000000
34   400 names             total 32455.000000
35   500 names             total 43174.000000
36   600 names             total 48050.000000
37   100 names            unique  7517.000000
38   200 names            unique 11794.000000
39   300 names            unique 19063.000000
40   400 names            unique 22917.000000
41   500 names            unique 31227.000000
42   600 names            unique 34406.000000

Now, this data can be easily plotted on the same graph using ggplot:

# One line per metric (the melted `variable` column) against number of files.
# Axis limits and breaks are taken from `melted` itself; the y limit
# previously referenced an undefined `for_plot` object.
ggplot(melted, aes(x = files)) +
  geom_line(aes(y = value, color = variable)) +
  ylim(0, max(melted$value)) +
  xlab("Number of files") +
  labs(color = "Metric") +
  scale_x_continuous(breaks = seq(0, max(melted$files), by = 100)) +
  ggtitle("Occurrences of person entities") +
  theme(legend.position = "bottom")

Rotate labels

+ theme(axis.text.x = element_text(angle = 45, hjust = 1))

Flip coords

+ coord_flip()

Sort geom bars by column

# Horizontal bars for patterns with more than 100 occurrences, ordered by
# their occurrence count.
df %>%
  filter(occurrences > 100) %>%
  ggplot(aes(x = reorder(pattern, occurrences))) +
  geom_bar(aes(y = occurrences), stat = "identity") +
  coord_flip()

Data science

Precision at, recall at, accuracy at

# Precision over the first `at_` rows of the global `data` frame:
#   sum(predicted1_true1) / (sum(predicted1_true1) + sum(predicted1_true0))
# (presumably true-positive and false-positive indicator columns; confirm
# against the code that builds `data`).
#
# at_: number of leading rows to evaluate. Values above nrow(data) are
#      clamped to the available rows, and 0 selects no rows (result NaN),
#      whereas 1:at_ used to pick row 1 for 0 and pad with NA rows.
precision_at <- function(at_) {
  top_rows <- data[seq_len(min(at_, nrow(data))), , drop = FALSE]
  hits <- sum(top_rows$predicted1_true1)
  false_alarms <- sum(top_rows$predicted1_true0)
  hits / (hits + false_alarms)
}

# Recall-style ratio over the first `at_` rows of the global `data` frame:
#   sum(predicted1_true1) / sum(class)
#
# at_: number of leading rows to evaluate. Values above nrow(data) are
#      clamped to the available rows, and 0 selects no rows (result NaN),
#      whereas 1:at_ used to pick row 1 for 0 and pad with NA rows.
recall_at <- function(at_) {
  top_rows <- data[seq_len(min(at_, nrow(data))), , drop = FALSE]
  sum(top_rows$predicted1_true1) / sum(top_rows$class)
}

# Share of correct rows among the first `at_` rows of the global `data`
# frame: sum(correct) / number of rows actually selected.
#
# at_: number of leading rows to evaluate. Values above nrow(data) are
#      clamped to the available rows, and 0 selects no rows (result NaN),
#      whereas 1:at_ used to pick row 1 for 0 and pad with NA rows.
accuracy_at <- function(at_) {
  top_rows <- data[seq_len(min(at_, nrow(data))), , drop = FALSE]
  sum(top_rows$correct) / nrow(top_rows)
}

No matches...