R | Data Science Notebook

Description	R	Comments	Python
examine type	class()		type()
vector	c(...)	one-dimensional array; all elements have the same type
name vector	names(vect)<-c(...)
slicing vector	vect[3] vect[c(3,5,6)]	indexing starts at 1 in R, compared to 0 in Python
2:5	same as c(2,3,4,5)	includes 5
vect[3:5]	same as vect[c(3,4,5)]
use names as index	vect[c('name1','name2',...)]
calculate average	mean()	in Python, have to import other libraries	np.mean()
vectors comparison	c(2,3,4,5)>3	in Python, have to use numpy or pandas
logical selection	vect[c(...)>n] vect[vect2(logical)]	in Python, pandas is common
matrix	matrix() matrix(1:9, byrow = TRUE, nrow = 3)	two-dimensional same data type	np.matrix()
Naming a matrix	rownames(my_matrix) <- row_names_vector colnames(my_matrix) <- col_names_vector
	dimnames = list(rowname, columnname)
Sum of values of each row	rowSums(some_matrix)		ndarray.sum(axis=1) df.sum(axis=1)
add column(s) to a matrix	bigger<- cbind(matrix1, matrix2, ...)		pd.concat([df1,df2],axis=1)
Adding a row(s) to a matrix	rbind(matrix1, matrix2, ...)		pd.concat([df1,df2],axis=0) df1.append(df2)
Sum of values of each column			ndarray.sum(axis=0) df.sum(axis=0)
slicing Matrix	matrix[row,col] my_matrix[1,2] my_matrix[1:3,2:4] my_matrix[ ,1] my_matrix[2, ]
factors	factor()	categorical
Convert vector to factor	my_factor <- factor(vector)
order/ non-order	temp_vector <- c("High", "Low", "High","Low", "Medium") factor_temp_vector <- factor(temp_vector, ordered = TRUE, levels = c("Low", "Medium", "High"))	nominal categorical variable ordinal categorical variable.	s = pd.Series(["a","b","c","a"], dtype="category") raw_cat = pd.Categorical(["a","b","c","a"], categories=["b","c","d"], ordered=False)
Factor levels	levels() levels(factor_vector) <- c("name1", "name2",...)
summary()	summary(my_var)		df.describe() Series.value_counts()
ordered	factor_speed_vector <-factor(speed_vector,ordered=TRUE,levels=c('slow','fast','insane'))	ordered factor can be compared
data frame	head(df) tail(df)		all values within a column must have the same data type
examine structure of a dataframe	str(df)
create data frame	data.frame(vectors)
slicing	df[rows,columns] df[row2,] entire row2 df[,column3] entire column3
use name slicing	df[2:5, 'name'] df['name', ] df[ ,'name']
subset() create	subset(planets_df, diameter<1) == planets_df[planets_df[,'diameter']<1,]
sorting	order() returns the sorting indices, not the values; to get sorted values: a[order(a)]
sorting df	indexes=order(df$column3) df[indexes, ]
list	my_list <- list(comp1, comp2 ...)
Creating a named list	my_list <- list(name1 = your_comp1, name2 = your_comp2)
same as above	my_list <- list(your_comp1, your_comp2) names(my_list) <- c("name1", "name2")
selecting elements from a list	shining_list[["reviews"]] == shining_list$reviews
	list[[2]][1]
add data to list	ext_list <- c(my_list , my_val)
comparison	& and \| or ! not	double sign (&& \|\|) only compares the first element (an error for length > 1 since R 4.3)
if syntax in R	if (condition) {do sth} else if (condition) {do sth} else {do sth}
read data	read.table read.delim read.csv read.csv2	hotdogs2 <- read.delim("hotdogs.txt", header = FALSE, col.names = c("type", "calories", "sodium"), colClasses = c("factor", "NULL", "numeric"))
check environment	environment(func)
specify func without a name	(function(x){x+1})(2) => 3
mean()	mean(c(1:9, NA),trim=0.1,na.rm=TRUE)	trim -> fraction of observations dropped from each end before averaging (reduces the effect of outliers)
environment	> f<-function () x > x<-99 > f() [1] 99
exists()	a<-5 exists("a") TRUE
vector properties	typeof() length()
null value in R	NULL (absence of an entire vector) NA (absence of one value in a vector)
check for NA	is.na()
sequence	seq(1,10) 1:10
merge vector	c(vector1, vector2, singlevalue, ...)
paste() paste0()	paste() sep=" " paste0 sep=""	string.join(list)
paste0("year_", 1:5)	[1] "year_1" "year_2" "year_3" "year_4" "year_5"
plotting	hist(one_dim_data) hist(df$column) boxplot(multi_dim_data) boxplot(df)

Data Science Notebook

R odds and ends R basics