practice | Data Science Notebook

Environment: Python2

Day 0: Mean, Median, and Mode

# Enter your code here. Read input from STDIN. Print output to STDOUT
wtf=raw_input()
wtf2=raw_input()

print wtf,type(wtf)
print wtf2, type(wtf2)

# Your Output (stdout)

10 <type 'str'>
64630 11735 14216 99233 14470 4978 73429 38120 51135 67060 <type 'str'>

my solution

def quartile_1(l):
    """Return the element a quarter of the way into the sorted copy of ``l``.

    The position is ``int(len(l) * .25)``: the quarter-length index truncated
    toward zero.  No averaging or interpolation between neighbours is done.
    """
    ordered = sorted(l)
    position = int(len(ordered) * .25)
    return ordered[position]

def median(l):
    """Return the middle element of the sorted copy of ``l``.

    For an even number of elements this is the upper of the two central
    values; the two are not averaged.

    Floor division keeps the index an integer wherever ``/`` performs true
    division (Python 3, or ``from __future__ import division``); a float
    index would make the subscript raise TypeError.
    """
    return sorted(l)[len(l) // 2]

def quartile_3(l):
    """Return the element three quarters of the way into the sorted copy of ``l``.

    The position is ``int(len(l) * .75)``: the three-quarter-length index
    truncated toward zero.  No averaging or interpolation is done.
    """
    ordered = sorted(l)
    position = int(len(ordered) * .75)
    return ordered[position]

# Tokenise the second input line on whitespace, then turn every token into a float.
li = wtf2.split()
ll = list(map(float, li))


def mean(x):
    """Return the sum of ``x`` divided by the number of its elements."""
    total = sum(x)
    count = len(x)
    return total / count

def median(x):
    """Return the median of ``x``.

    The single central value is used when the length is odd, and the mean of
    the two central values when it is even.  The work is done on a sorted
    copy, so the caller's list is not reordered.

    Raises ZeroDivisionError when ``x`` is empty.
    """
    ordered = sorted(x)
    count = len(ordered)
    # Slice out the one (odd count) or two (even count) central values directly
    # instead of repeatedly trimming both ends of the list.
    middle = ordered[(count - 1) // 2:count // 2 + 1]
    return sum(middle) / len(middle)
    
def mode(x):
    """Return the most frequent value of ``x``; ties go to the smallest value.

    Every element is converted with ``int()`` before it is counted, so the
    result is always an int.  The counts are taken from the argument itself
    rather than from any module-level list.

    Raises ValueError when ``x`` is empty.
    """
    counts = {}
    for value in x:
        key = int(value)
        counts[key] = counts.get(key, 0) + 1
    highest = max(counts.values())
    # Several values may share the highest count; report the smallest of them.
    return min(key for key, seen in counts.items() if seen == highest)
 
print mean(ll)
print median(ll)
print mode(ll)

Environment: R

# Enter your code here. Read input from STDIN. Print output to STDOUT

# Read every line of STDIN. The connection is created and closed explicitly so
# it is not left behind for the garbage collector, and `warn = FALSE` silences
# only readLines()' own end-of-file warnings instead of every warning.
stdin_con <- file("stdin")
input_lines <- readLines(stdin_con, warn = FALSE)
close(stdin_con)

# The first line is the element count; the values are on the second line.
stopifnot(length(input_lines) >= 2)
x <- as.numeric(strsplit(input_lines[[2]], " ", fixed = TRUE)[[1]])

print(mean(x))
print(median(x))

#' Most frequent value of a vector
#'
#' @param v A vector.
#' @return The value that occurs most often in `v`. When several values tie
#'   for the highest count, the smallest of them is returned. `NA` entries
#'   are dropped by `sort()` and therefore never counted.
getmode <- function(v) {
  # Sorting the candidates first makes which.max(), which reports the first
  # maximum, settle ties on the smallest value.
  uniqv <- sort(unique(v))
  uniqv[which.max(tabulate(match(v, uniqv)))]
}
# Print the mode, not the minimum: min(x) equals the mode only while every
# value is distinct. getmode() receives the sorted values so that a tie between
# equally frequent values always resolves to the smallest candidate.
print(getmode(sort(x)))

Day 0: Weighted Mean

# Enter your code here. Read input from STDIN. Print output to STDOUT
# The first line carries the element count; it is read only to advance STDIN.
c = raw_input()
a = raw_input()
b = raw_input()

# Input (stdin)
# 5
# 10 40 30 50 20
# 1 2 3 4 5


# Values (a) and their weights (b), split on single spaces and parsed as floats.
a = list(map(float, a.split(' ')))
b = list(map(float, b.split(' ')))

# Pair every value with its weight and keep the products.
up = []
for value, weight in zip(a, b):
    up.append(value * weight)

# Weighted mean = sum of products / sum of weights, rounded to one decimal.
print(round(sum(up) / sum(b), 1))

Day 1: Quartiles

Sample Input

9
3 7 8 5 12 14 21 13 18

Sample Output

6
12
16

Explanation

Lower half (L): 3, 5, 7, 8

Upper half (U): 13, 14, 18, 21

def quartile_1(l):
    """Pick the value at index ``int(len(l) * .25)`` of the sorted copy of ``l``.

    The index is truncated toward zero and the value found there is returned
    as is; neighbouring values are never averaged.
    """
    ranked = sorted(l)
    index = int(len(ranked) * .25)
    return ranked[index]

def median(l):
    """Return the middle element of the sorted copy of ``l``.

    With an even number of elements the upper of the two central values is
    returned; they are not averaged.

    The index uses floor division so it stays an integer wherever ``/``
    performs true division (Python 3, or ``from __future__ import division``),
    where a float subscript would raise TypeError.
    """
    return sorted(l)[len(l) // 2]

def quartile_3(l):
    """Pick the value at index ``int(len(l) * .75)`` of the sorted copy of ``l``.

    The index is truncated toward zero and the value found there is returned
    as is; neighbouring values are never averaged.
    """
    ranked = sorted(l)
    index = int(len(ranked) * .75)
    return ranked[index]

# Enter your code here. Read input from STDIN. Print output to STDOUT

# Day 1: Quartiles -- read the element count and the values from STDIN, then
# print Q1, Q2 (the median) and Q3, one per line.

def _middle(values):
    """Return the median of an already sorted, non-empty list of ints.

    For an even number of values the two central values are averaged with
    floor division, so the result stays an int.
    """
    size = len(values)
    centre = size // 2
    if size % 2:
        return values[centre]
    return (values[centre - 1] + values[centre]) // 2


y = int(raw_input())  # declared element count; read to advance STDIN
x = sorted(int(i) for i in raw_input().split())

# Split by position rather than by looking a value up with x.index(...):
# index() finds the first occurrence and mis-splits data that has duplicates.
# An odd-sized sample leaves its central element out of both halves.
# At least two values are needed, otherwise the halves are empty.
half = len(x) // 2
lower = x[:half]
upper = x[half + len(x) % 2:]

print(_middle(lower))
print(_middle(x))
print(_middle(upper))

Or

# Enter your code here. Read input from STDIN. Print output to STDOUT


y = int(raw_input())  # declared element count; read to advance STDIN
x = sorted(int(i) for i in raw_input().split())


def _window_median(start, stop):
    """Return the median of the sorted slice ``x[start:stop]`` (non-empty).

    An odd-width window yields its central element; an even-width window
    yields the floor-divided average of its two central elements.
    """
    width = stop - start
    centre = start + width // 2
    if width % 2:
        return x[centre]
    return (x[centre - 1] + x[centre]) // 2


n = len(x)
# Size of the lower and of the upper half. For an odd n the central element
# belongs to neither half, so the halves may themselves be odd-sized (n = 7
# gives halves of three) and must not always average two neighbours.
half = n // 2

print(_window_median(0, half))       # Q1: median of the lower half
print(_window_median(0, n))          # Q2: median of the whole sample
print(_window_median(n - half, n))   # Q3: median of the upper half

Day 1: Standard Deviation

Sample Input

5
10 40 30 50 20

Sample Output

14.1

# Enter your code here. Read input from STDIN. Print output to STDOUT
# Day 1: Standard Deviation -- read the count and the values from STDIN, then
# print the population standard deviation rounded to one decimal place, the
# precision shown in the sample output (14.1).
y = raw_input()
y = int(y)  # declared element count; the arithmetic below uses len(x)
x = raw_input()


x = [int(i) for i in x.split(' ')]

m = sum(x) / float(len(x))                  # arithmetic mean
square = [(value - m) ** 2 for value in x]  # squared deviation of each value
mu = (sum(square) / len(x)) ** 0.5          # root of the mean squared deviation

print(round(mu, 1))

R odds and ends R basics

Description	R	Comments	Python
examine type	class()		type()
vector	c(...)	one-dimension arrays same type
name vector	names(vect)<-c(...)
slicing vector	vect[3] vect[c(3,5,6)]	starting from 1 compares to 0 in Python
2:5	c(2, 3, 4, 5)	includes 5
vect[3:5]	vect[c(3, 4, 5)]
use names as index	vec[c('name1','name2',...)]
calculate average	mean()	in Python, have to import other libraries	np.mean()
vectors comparison	c(2,3,4,5)>3	in Python, have to in numpy, pandas
logical selection	vect[c(...)>n] vect[vect2(logical)]	in Python, pandas is common
matrix	matrix() matrix(1:9, byrow = TRUE, nrow = 3)	two-dimensional same data type	np.matrix()
Naming a matrix	rownames(my_matrix) <- row_names_vector colnames(my_matrix) <- col_names_vector
	dimnames = list(rowname, columnname)
Sum of values of each row	rowSums(some_matrix)		ndarray.sum(axis=1) df.sum(axis=1)
add column(s) to a matrix	bigger<- cbind(matrix1, matrix2, ...)		pd.concat([df1,df2],axis=1)
Adding a row(s) to a matrix	rbind(matrix1, matrix2, ...)		pd.concat([df1,df2],axis=0) df1.append(df2)
Sum of values of each column			ndarray.sum(axis=0) df.sum(axis=0)
slicing Matrix	matrix[row,col] my_matrix[1,2] my_matrix[1:3,2:4] my_matrix[ ,1] my_matrix[2, ]
factors	factor()	categorical
Convert vector to factor	my_factor <- factor(vector)
order/ non-order	temp_vector <- c("High", "Low", "High","Low", "Medium") factor_temp_vector <- factor(temp_vector, order = TRUE, levels = c("Low", "Medium", "High"))	nominal categorical variable ordinal categorical variable.	s = pd.Series(["a","b","c","a"], dtype="category") raw_cat = pd.Categorical(["a","b","c","a"], categories=["b","c","d"], .... ordered=False)
Factor levels	levels() levels(factor_vector) <- c("name1", "name2",...)
summary()	summary(my_var)		df.describe() Series.value_counts()
ordered	factor_speed_vector <-factor(speed_vector,ordered=TRUE,levels=c('slow','fast','insane'))	ordered factor can be compared
data frame	head(df) tail(df)		all values within a column must share the same data type
examine structure of a dataframe	str(df)
create data frame	data.frame(vectors)
slicing	df[rows,columns] df[row2,] entire row2 df[,column3] entire column3
use name slicing	df[2:5, 'name'] df['name', ] df[ ,'name']
subset() create	subset(planets_df, diameter<1) == planets_df[planets_df[,'diameter']<1,]
sorting	order() returns ranked index not values values: a[order(a)]
sorting df	indexes=order(df$column3) df[indexes, ]
list	my_list <- list(comp1, comp2 ...)
Creating a named list	my_list <- list(name1 = your_comp1, name2 = your_comp2)
same as above	my_list <- list(your_comp1, your_comp2) names(my_list) <- c("name1", "name2")
selecting elements from a list	shining_list[["reviews"]] == shining_list$reviews
	list[[2]][1]
add data to list	ext_list <- c(my_list , my_val)
comparison	& and \| or ! not	double sign only compares the first element && \|\|
if syntax in R	if (condition) {do sth} else if (condition) {do sth} else {do sth}
read data	read.table read.delim read.csv read.csv2	hotdogs2 <- read.delim("hotdogs.txt", header = FALSE, col.names = c("type", "calories", "sodium"), colClasses = c("factor", "NULL", "numeric"))
check environment	environment(func)
specify func without a name	(function(x){x+1})(2) => 3
mean()	mean(c(1:9, NA),trim=0.1,na.rm=TRUE)	trim -> remove outliers
environment	> f<-function () x > x<-99 > f() [1] 99
exists()	a<-5 exists("a") TRUE
vector properties	typeof() length()
null value in R	NULL (absence of the entire vector) NA (absence of one value in a vector)
check for NA	is.na()
sequence	seq(1,10) 1:10
merge vector	c(vector1, vector2, singlevalue, ...)
paste() paste0()	paste() sep=" " paste0 sep=""	string.join(list)
paste0("year_", 1:5)	[1] "year_1" "year_2" "year_3" "year_4" "year_5"
plotting	hist(one_dim_data) hist(df$column) boxplot(multi_dim_data) boxplot(df)

Data Science Notebook

10 Days of Statistics my solution

Environment: Python2

Day 0: Mean, Median, and Mode

Environment: R

Day 0: Weighted Mean

Day 1: Quartiles

Or

Day 1: Standard Deviation

R odds and ends R basics