# R odds and ends R basics

examine type: class() (Python: type())
vectorc(...)one-dimension arrays
same type
name vectornames(vect)<-c(...)
slicing vectorvect[3]
vect[c(3,5,6)]
indexing starts from 1, compared to 0 in Python
2:5 includes 5
vect[3:5]
use names as indexvec[c('name1','name2',...)]
calculate average: mean() (in Python you have to import another library, e.g. np.mean())
vector comparison: c(2,3,4,5) > 3 (in Python this needs numpy or pandas)
logical selection vect[c(...)>n]
vect[vect2(logical)]
in Python, pandas is common
matrix matrix()
matrix(1:9, byrow = TRUE, nrow = 3)
two-dimensional
same data type
np.matrix()
Naming a matrixrownames(my_matrix) <- row_names_vector

colnames(my_matrix) <- col_names_vector
dimnames =
list(rowname, columnname)
Sum of values of each rowrowSums(some_matrix)ndarray.sum(axis=1)
df.sum(axis=1)
add column(s) to a matrixbigger<- cbind(matrix1, matrix2, ...)pd.concat([df1,df2],axis=1)
Adding a row(s) to a matrixrbind(matrix1, matrix2, ...)pd.concat([df1,df2],axis=0)
df1.append(df2)
Sum of values of each columnndarray.sum(axis=0)
df.sum(axis=0)
slicing Matrixmatrix[row,col]
my_matrix[1,2]
my_matrix[1:3,2:4]
my_matrix[ ,1]
my_matrix[2, ]
factorsfactor()categorical
Convert vector to factor: my_factor <- factor(vector, ...)
order/ non-ordertemp_vector <- c("High", "Low", "High","Low", "Medium")

factor_temp_vector <- factor(temp_vector, order = TRUE, levels = c("Low", "Medium", "High"))
nominal categorical variable

ordinal categorical variable.
s = pd.Series(["a","b","c","a"], dtype="category")

raw_cat = pd.Categorical(["a","b","c","a"], categories=["b","c","d"], .... ordered=False)
Factor levelslevels()
levels(factor_vector) <- c("name1", "name2",...)
summary()summary(my_var)df.describe()
Series.value_counts()
orderedfactor_speed_vector <-factor(speed_vector,ordered=TRUE,levels=c('slow','fast','insane'))
ordered factor can be compared
tail(df)
each column must be same data type
examine structure of a dataframestr(df)
create data framedata.frame(vectors)
slicing df[rows,columns]
df[row2,] entire row2
df[,column3] entire column3
use name slicingdf[2:5, 'name']
df['name', ]
df[ ,'name']
subset()
create
subset(planets_df, diameter<1)
==
planets_df[planets_df[,'diameter']<1,]
sortingorder()
returns ranked index not values
values: a[order(a)]
sorting dfindexes=order(df$column3) df[indexes, ] listmy_list <- list(comp1, comp2 ...) Creating a named listmy_list <- list(name1 = your_comp1, name2 = your_comp2) same as abovemy_list <- list(your_comp1, your_comp2) names(my_list) <- c("name1", "name2") selecting elements from a listshining_list[["reviews"]] == shining_list$reviews
list[[2]][1]
add data to listext_list <- c(my_list , my_val)
comparison& and
| or
! not
double sign (&&, ||) is scalar: older R only compared the first element; since R 4.3 a length > 1 input is an error
&&
||
if syntax in Rif (condition)
{do sth}
else if (condition)
{do sth}
else
{do sth}
hotdogs2 <- read.delim("hotdogs.txt", header = FALSE, col.names = c("type", "calories", "sodium"), colClasses = c("factor", "NULL", "numeric"))
check environmentenvironment(func)
anonymous function (no name), called immediately: (function(x){x+1})(2)

=> 3
mean()mean(c(1:9, NA),trim=0.1,na.rm=TRUE)trim -> remove outliers
environment> f<-function () x
> x<-99
> f()
[1] 99
exists()a<-5
exists("a")
TRUE
vector properties: typeof()
length()
null value in R: NULL (absence of the entire vector)
NA (absence of one value in a vector)
check for NA: is.na() (use is.null() to check for NULL)
sequenceseq(1,10)
1:10
merge vectorc(vector1, vector2, singlevalue, ...)
paste()
paste0()
paste() sep=" "
paste0 sep=""
string.join(list)
paste0("year_", 1:5)[1] "year_1" "year_2" "year_3" "year_4" "year_5"
plottinghist(one_dim_data)
hist(df$column) boxplot(multi_dim_data) boxplot(df) # loading data case using generators and chunks example not using Pandas loading data case using generators and chunks example not using Pandas This is a study note summary of some courses from DataCamp 🙂 bank case ### dataset: World Development Indicators¶ ## World bank data¶ • Data on world economies for over half a century • Indicators • Population • Electricity consumption • CO2 emissions • Literacy rates • Unemployment In [2]: import pandas as pd !dir   Volume in drive D is VM Volume Serial Number is 023B-9433 Directory of D:\Dropbox\datacamp\toolbox 2 02/11/2017 04:05 PM <DIR> . 02/11/2017 04:05 PM <DIR> .. 02/11/2017 03:58 PM <DIR> .ipynb_checkpoints 02/11/2017 04:05 PM 1,098 bank case.ipynb 02/11/2017 02:34 AM 303,160 ch1_slides.pdf 02/11/2017 03:28 PM 421,057 ch2_slides.pdf 02/11/2017 04:03 PM 161,340 ch3_slides.pdf 11/22/2016 11:39 AM 30,959,939 kamcord_data.csv 02/11/2017 03:56 PM 11,670 list.ipynb 02/11/2017 03:24 PM 40,789 tool2.ipynb 02/11/2017 03:21 PM 3,909,340 tweets.csv 02/10/2017 01:17 AM 4,092,402 tweets.txt 10/02/2016 01:30 PM 26,623 university_towns.txt 12/21/2016 02:34 PM 139,169 WDI_Country.csv 12/28/2016 03:29 PM 743,579 WDI_CS_Notes.csv 02/01/2017 03:20 PM 187,559,624 WDI_Data.csv 02/01/2017 03:19 PM 36,410 WDI_Description.csv 12/28/2016 03:28 PM 57,084,594 WDI_Footnotes.csv 12/21/2016 02:34 PM 3,512,261 WDI_Series.csv 12/21/2016 02:34 PM 32,781 WDI_ST_Notes.csv 17 File(s) 289,035,836 bytes 3 Dir(s) 35,920,375,808 bytes free  In [43]: f = pd.read_csv('WDI_Data.csv',chunksize=10000) df = f.next() df.shape  Out[43]: (10000, 61) In [44]: print df.columns df=df.iloc[:,:5].dropna()  Index([u'Country Name', u'Country Code', u'Indicator Name', u'Indicator Code', u'1960', u'1961', u'1962', u'1963', u'1964', u'1965', u'1966', u'1967', u'1968', u'1969', u'1970', u'1971', u'1972', u'1973', u'1974', u'1975', u'1976', u'1977', u'1978', u'1979', u'1980', u'1981', u'1982', u'1983', u'1984', u'1985', 
u'1986', u'1987', u'1988', u'1989', u'1990', u'1991', u'1992', u'1993', u'1994', u'1995', u'1996', u'1997', u'1998', u'1999', u'2000', u'2001', u'2002', u'2003', u'2004', u'2005', u'2006', u'2007', u'2008', u'2009', u'2010', u'2011', u'2012', u'2013', u'2014', u'2015', u'2016'], dtype='object')  In [45]: df.shape  Out[45]: (725, 5) In [48]: df[df['Indicator Code']=='SP.ADO.TFRT']  Out[48]: Country Name Country Code Indicator Name Indicator Code 1960 48 Arab World ARB Adolescent fertility rate (births per 1,000 wo... SP.ADO.TFRT 133.555013 1500 Caribbean small states CSS Adolescent fertility rate (births per 1,000 wo... SP.ADO.TFRT 162.871212 2952 Central Europe and the Baltics CEB Adolescent fertility rate (births per 1,000 wo... SP.ADO.TFRT 46.716752 4404 Early-demographic dividend EAR Adolescent fertility rate (births per 1,000 wo... SP.ADO.TFRT 116.406607 5856 East Asia & Pacific EAS Adolescent fertility rate (births per 1,000 wo... SP.ADO.TFRT 66.015974 7308 East Asia & Pacific (excluding high income) EAP Adolescent fertility rate (births per 1,000 wo... SP.ADO.TFRT 75.043631 8760 East Asia & Pacific (IDA & IBRD countries) TEA Adolescent fertility rate (births per 1,000 wo... 
SP.ADO.TFRT 76.409849 In [55]: content = df[df['Indicator Code']=='SP.ADO.TFRT'].iloc[0,]  In [65]: row = list(content.values) row  Out[65]: ['Arab World', 'ARB', 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 'SP.ADO.TFRT', 133.55501327768999] In [64]: names = ['CountryName', 'CountryCode', 'IndicatorName', 'IndicatorCode', 'Year', 'Value']  ### Dictionaries for data science¶ In [66]: # Zip lists: zipped_lists zipped_lists = zip(names, row) # Create a dictionary: rs_dict rs_dict = dict(zipped_lists) # Print the dictionary print(rs_dict)  {'CountryName': 'Arab World', 'IndicatorName': 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 'IndicatorCode': 'SP.ADO.TFRT', 'CountryCode': 'ARB', 'Year': 133.55501327768999}  ### Writing a function¶ In [68]: # Define lists2dict() def lists2dict(list1, list2): """Return a dictionary where list1 provides the keys and list2 provides the values.""" # Zip lists: zipped_lists zipped_lists = zip(list1, list2) # Create a dictionary: rs_dict rs_dict = dict(zipped_lists) # Return the dictionary return rs_dict # Call lists2dict: rs_fxn rs_fxn = lists2dict(names, row) # Print rs_fxn print(rs_fxn)  {'CountryName': 'Arab World', 'IndicatorName': 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 'IndicatorCode': 'SP.ADO.TFRT', 'CountryCode': 'ARB', 'Year': 133.55501327768999}  ### Using a list comprehension¶ In [81]: # Print the first two lists in row_lists print(df.iloc[0,:]) print print(df.iloc[1,:]) print # Turn list of lists into list of dicts: list_of_dicts list_of_dicts = [lists2dict(names, sublist) for sublist in df.values] # Print the first two dictionaries in list_of_dicts print(list_of_dicts[0]) print(list_of_dicts[1])  Country Name Arab World Country Code ARB Indicator Name Adolescent fertility rate (births per 1,000 wo... 
Indicator Code SP.ADO.TFRT 1960 133.555 Name: 48, dtype: object Country Name Arab World Country Code ARB Indicator Name Age dependency ratio (% of working-age populat... Indicator Code SP.POP.DPND 1960 87.7992 Name: 55, dtype: object {'CountryName': 'Arab World', 'IndicatorName': 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 'IndicatorCode': 'SP.ADO.TFRT', 'CountryCode': 'ARB', 'Year': 133.55501327769} {'CountryName': 'Arab World', 'IndicatorName': 'Age dependency ratio (% of working-age population)', 'IndicatorCode': 'SP.POP.DPND', 'CountryCode': 'ARB', 'Year': 87.79923459912621}  ### Turning this all into a DataFrame¶ In [86]: # Import the pandas package import pandas as pd # Turn list of lists into list of dicts: list_of_dicts list_of_dicts = [lists2dict(names, sublist) for sublist in df.values] # Turn list of dicts into a dataframe: df df2 = pd.DataFrame(list_of_dicts) print df2.shape # Print the head of the dataframe df2.head()  (725, 5)  Out[86]: CountryCode CountryName IndicatorCode IndicatorName Year 0 ARB Arab World SP.ADO.TFRT Adolescent fertility rate (births per 1,000 wo... 133.555013 1 ARB Arab World SP.POP.DPND Age dependency ratio (% of working-age populat... 87.799235 2 ARB Arab World SP.POP.DPND.OL Age dependency ratio, old (% of working-age po... 6.635328 3 ARB Arab World SP.POP.DPND.YG Age dependency ratio, young (% of working-age ... 
81.024250 4 ARB Arab World ER.FSH.AQUA.MT Aquaculture production (metric tons) 4600.000000 # Using Python generators for streaming data¶ ## Processing data in chunks¶ ### example not using Pandas¶ • with open(path) as name: do sth In [90]: # Open a connection to the file with open('WDI_Data.csv') as f: # Skip the column names f.readline() # Initialize an empty dictionary: counts_dict counts_dict = {} # Process only the first 1000 rows for j in range(0, 1000): # Split the current line into a list: line line = f.readline().split(',') # Get the value for the first column: first_col first_col = line[0] # If the column value is in the dict, increment its value if first_col in counts_dict.keys(): counts_dict[first_col] += 1 # Else, add to the dict and set value to 1 else: counts_dict[first_col] = 1 # Print the resulting dictionary print(counts_dict)  {'Arab World': 1000}  ### In the previous exercise, you processed a file line by line for a given number of lines. What if, however, we want to to do this for the entire file?¶ ### In this case, it would be useful to use generators. Generators allow users to lazily evaluate data.¶ • This concept of lazy evaluation is useful when you have to deal with very large datasets because it lets you generate values in an efficient manner by yielding only chunks of data at a time instead of the whole thing at once. 
### define a generator function read_large_file() that produces a generator object which yields a single line from a file each time next() is called on it.¶ In [92]: # Define read_large_file() def read_large_file(file_object): """A generator function to read a large file lazily.""" # Loop indefinitely until the end of the file while True: # Read a line from the file: data data = file_object.readline() # Break if this is the end of the file if not data: break # Yield the line of data yield data # Open a connection to the file with open('WDI_Data.csv') as file: # Create a generator object for the file: gen_file gen_file = read_large_file(file) # Print the first three lines of the file print(next(gen_file)) print(next(gen_file)) print(next(gen_file))  Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016 Arab World,ARB,"2005 PPP conversion factor, GDP (LCU per international$)",PA.NUS.PPP.05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,

Arab World,ARB,"2005 PPP conversion factor, private consumption (LCU per international \$)",PA.NUS.PRVT.PP.05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


• You've just created a generator function that you can use to help you process large files.
• You will process the file line by line, to create a dictionary of the counts of how many times each country appears in a column in the dataset.
• you'll process the entire dataset!
In [93]:
# Initialize an empty dictionary: counts_dict
counts_dict = {}

# Open a connection to the file
with open('WDI_Data.csv') as file:

# Iterate over the generator from read_large_file()

row = line.split(',')
first_col = row[0]

if first_col in counts_dict.keys():
counts_dict[first_col] += 1
else:
counts_dict[first_col] = 1

# Print
print(counts_dict)

{'Canada': 1452, 'Sao Tome and Principe': 1452, 'Turkmenistan': 1452, 'Lao PDR': 1452, 'Arab World': 1452, 'Lithuania': 1452, 'Cambodia': 1452, 'Switzerland': 1452, 'Ethiopia': 1452, 'Saudi Arabia': 1452, 'OECD members': 1452, 'Swaziland': 1452, 'South Asia': 1452, 'Argentina': 1452, 'Bolivia': 1452, 'Cameroon': 1452, 'Burkina Faso': 1452, 'Bahrain': 1452, 'Middle East & North Africa (IDA & IBRD countries)': 1452, 'Rwanda': 1452, 'South Asia (IDA & IBRD)': 1452, '"Egypt': 1452, 'Japan': 1452, 'Channel Islands': 1452, 'American Samoa': 1452, 'Northern Mariana Islands': 1452, 'Slovenia': 1452, 'East Asia & Pacific (IDA & IBRD countries)': 1452, 'IDA total': 1452, 'Bosnia and Herzegovina': 1452, 'Guinea': 1452, 'Russian Federation': 1452, 'World': 1452, 'St. Lucia': 1452, 'Dominica': 1452, 'Liberia': 1452, 'Maldives': 1452, 'Pakistan': 1452, 'Virgin Islands (U.S.)': 1452, 'Oman': 1452, 'Tanzania': 1452, 'Early-demographic dividend': 1452, 'Cabo Verde': 1452, 'Mauritania': 1452, 'Greenland': 1452, 'Gabon': 1452, 'Monaco': 1452, 'New Zealand': 1452, 'Spain': 1452, 'European Union': 1452, '"Venezuela': 1452, 'Jamaica': 1452, 'Albania': 1452, 'Samoa': 1452, 'Slovak Republic': 1452, 'Kazakhstan': 1452, 'Guam': 1452, 'Uruguay': 1452, 'India': 1452, 'Azerbaijan': 1452, 'Lesotho': 1452, 'Middle East & North Africa': 1452, 'Europe & Central Asia (IDA & IBRD countries)': 1452, 'United Arab Emirates': 1452, 'Latin America & Caribbean': 1452, 'Aruba': 1452, 'Upper middle income': 1452, 'Tajikistan': 1452, 'Pacific island small states': 1452, 'Turkey': 1452, 'Afghanistan': 1452, 'Bangladesh': 1452, 'East Asia & Pacific': 1452, 'Solomon Islands': 1452, 'Turks and Caicos Islands': 1452, 'Palau': 1452, 'San Marino': 1452, 'French Polynesia': 1452, 'France': 1452, 'Syrian Arab Republic': 1452, 'Bermuda': 1452, 'Somalia': 1452, 'Peru': 1452, 'Vanuatu': 1452, 'Nauru': 1452, 'Seychelles': 1452, 'Late-demographic dividend': 1452, "Cote d'Ivoire": 1452, 'West Bank and Gaza': 1452, 'Benin': 
1452, 'Other small states': 1452, '"Gambia': 1452, 'Cuba': 1452, 'Montenegro': 1452, 'Low & middle income': 1452, 'Togo': 1452, 'China': 1452, 'Armenia': 1452, 'Jordan': 1452, 'Timor-Leste': 1452, 'Dominican Republic': 1452, '"Hong Kong SAR': 1452, 'Ukraine': 1452, 'Ghana': 1452, 'Tonga': 1452, 'Finland': 1452, 'Colombia': 1452, 'Libya': 1452, 'Cayman Islands': 1452, 'Central African Republic': 1452, 'North America': 1452, 'Liechtenstein': 1452, 'Belarus': 1452, 'British Virgin Islands': 1452, 'Kenya': 1452, 'Sweden': 1452, 'Poland': 1452, 'Bulgaria': 1452, 'Mauritius': 1452, 'Romania': 1452, 'Angola': 1452, 'Central Europe and the Baltics': 1452, 'Chad': 1452, 'South Africa': 1452, 'St. Vincent and the Grenadines': 1452, 'Cyprus': 1452, 'Caribbean small states': 1452, 'Brunei Darussalam': 1452, 'Qatar': 1452, 'Pre-demographic dividend': 1452, 'Middle income': 1452, 'Austria': 1452, 'Vietnam': 1452, 'Mozambique': 1452, 'Uganda': 1452, 'Kyrgyz Republic': 1452, 'Hungary': 1452, 'Niger': 1452, 'Isle of Man': 1452, 'United States': 1452, 'Brazil': 1452, 'Sub-Saharan Africa (IDA & IBRD countries)': 1452, '"Macao SAR': 1452, 'Faroe Islands': 1452, 'Europe & Central Asia (excluding high income)': 1452, 'Panama': 1452, 'Mali': 1452, 'Costa Rica': 1452, 'Luxembourg': 1452, 'St. 
Kitts and Nevis': 1452, 'Andorra': 1452, 'Norway': 1452, 'Euro area': 1452, 'Gibraltar': 1452, 'Ireland': 1452, 'Italy': 1452, 'Nigeria': 1452, 'Lower middle income': 1452, 'Ecuador': 1452, 'IDA & IBRD total': 1452, 'Australia': 1452, 'Algeria': 1452, 'El Salvador': 1452, 'Tuvalu': 1452, 'IDA only': 1452, 'Guatemala': 1452, 'Czech Republic': 1452, 'Sub-Saharan Africa': 1452, 'Middle East & North Africa (excluding high income)': 1452, 'Chile': 1452, 'Marshall Islands': 1452, 'Belgium': 1452, 'Kiribati': 1452, 'Haiti': 1452, 'Belize': 1452, 'Fragile and conflict affected situations': 1452, 'Sierra Leone': 1452, 'Georgia': 1452, '"Yemen': 1452, 'Denmark': 1452, 'Post-demographic dividend': 1452, 'Puerto Rico': 1452, 'Moldova': 1452, 'Morocco': 1452, 'Croatia': 1452, 'Mongolia': 1452, 'Guinea-Bissau': 1452, 'Thailand': 1452, 'Namibia': 1452, 'Grenada': 1452, 'Latin America & Caribbean (excluding high income)': 1452, 'Iraq': 1452, 'Portugal': 1452, 'Estonia': 1452, 'Kosovo': 1452, 'Mexico': 1452, 'Lebanon': 1452, '"Congo': 2904, 'Uzbekistan': 1452, 'Djibouti': 1452, 'Country Name': 1, 'Antigua and Barbuda': 1452, 'Low income': 1452, 'High income': 1452, 'Burundi': 1452, 'Least developed countries: UN classification': 1452, 'IDA blend': 1452, 'Barbados': 1452, 'Madagascar': 1452, 'Sub-Saharan Africa (excluding high income)': 1452, 'Curacao': 1452, 'Bhutan': 1452, 'Sudan': 1452, 'Nepal': 1452, 'Malta': 1452, '"Micronesia': 1452, 'Netherlands': 1452, '"Bahamas': 1452, '"Macedonia': 1452, 'Kuwait': 1452, 'Europe & Central Asia': 1452, 'United Kingdom': 1452, 'Israel': 1452, 'Indonesia': 1452, 'Malaysia': 1452, 'Iceland': 1452, 'Zambia': 1452, 'Senegal': 1452, 'Papua New Guinea': 1452, 'Malawi': 1452, 'Suriname': 1452, 'Trinidad and Tobago': 1452, 'Zimbabwe': 1452, 'Germany': 1452, 'St. 
Martin (French part)': 1452, 'East Asia & Pacific (excluding high income)': 1452, 'Philippines': 1452, '"Iran': 1452, 'Eritrea': 1452, 'Small states': 1452, 'New Caledonia': 1452, 'Sri Lanka': 1452, 'Not classified': 1452, 'Latvia': 1452, 'South Sudan': 1452, '"Korea': 2904, 'Guyana': 1452, 'IBRD only': 1452, 'Honduras': 1452, 'Myanmar': 1452, 'Equatorial Guinea': 1452, 'Tunisia': 1452, 'Nicaragua': 1452, 'Singapore': 1452, 'Serbia': 1452, 'Comoros': 1452, 'Latin America & the Caribbean (IDA & IBRD countries)': 1452, 'Sint Maarten (Dutch part)': 1452, 'Greece': 1452, 'Paraguay': 1452, 'Fiji': 1452, 'Botswana': 1452, 'Heavily indebted poor countries (HIPC)': 1452}


### Writing an iterator to load data in chunks¶

In [ ]:
# Initialize reader object: urb_pop_reader

# Get the first dataframe chunk: df_urb_pop

# Check out the head of the dataframe

# Check out specific country: df_pop_ceb
df_pop_ceb = df_urb_pop[df_urb_pop['Country Code'] == 'CEB']

# Zip dataframe columns of interest: pops
pops = zip(df_pop_ceb['Total Population'],
df_pop_ceb['Urban population (% of total)'])

# Turn zip object into list: pops_list
pops_list = list(pops)

# Print pops_list
print(pops_list)

In [ ]:
# Initialize reader object: urb_pop_reader

# Get the first dataframe chunk: df_urb_pop

# Check out specific country: df_pop_ceb
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB']

# Zip dataframe columns of interest: pops
pops = zip(df_pop_ceb['Total Population'],
df_pop_ceb['Urban population (% of total)'])

# Turn zip object into list: pops_list
pops_list = list(pops)

# Use list comprehension to create new dataframe column 'Total Urban Population'
df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1]) for tup in pops_list]

# Plot urban population data
df_pop_ceb.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()

In [ ]:
# Define plot_pop()
def plot_pop(filename, country_code):

# Initialize empty dataframe: data
data = pd.DataFrame()

# Iterate over each dataframe chunk
# Check out specific country: df_pop_ceb
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code]

# Zip dataframe columns of interest: pops
pops = zip(df_pop_ceb['Total Population'],
df_pop_ceb['Urban population (% of total)'])

# Turn zip object into list: pops_list
pops_list = list(pops)

# Use list comprehension to create new dataframe column 'Total Urban Population'
df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1]) for tup in pops_list]

# Append dataframe chunk to data: data
data = data.append(df_pop_ceb)

# Plot urban population data
data.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()

# Set the filename: fn
fn = 'ind_pop_data.csv'

# Call plot_pop for country code 'CEB'
plot_pop(fn, 'CEB')

# Call plot_pop for country code 'ARB'
plot_pop(fn, 'ARB')


# list comprehension and generators

list comprehension and generators

list

# list comprehensions and generators¶

### Nested list comprehensions¶

• [[output expression] for iterator variable in iterable]
• Collapse for loops for building lists into a single line
• Components
• Iterable
• Iterator variable (represent members of iterable)
• Output expression
In [1]:
# Create a 5 x 5 matrix using a list of lists: matrix
matrix = [[col for col in range(5)] for row in range(5)]

# Print the matrix
for row in matrix:
print(row)

[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]

In [7]:
pair_2=[(num1, num2) for num1 in range(0, 2) for num2 in range(6, 8)]
pair_2

Out[7]:
[(0, 6), (0, 7), (1, 6), (1, 7)]

### Using conditionals in comprehensions¶

• [ output expression for iterator variable in iterable if predicate expression ].
In [2]:
# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']

# Create list comprehension: new_fellowship
new_fellowship = [member for member in fellowship if len(member) >= 7]

# Print the new list
print(new_fellowship)

['samwise', 'aragorn', 'legolas', 'boromir']

In [3]:
# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']

# Create list comprehension: new_fellowship
new_fellowship = [member if len(member) >= 7 else '' for member in fellowship]

# Print the new list
print(new_fellowship)

['', 'samwise', '', 'aragorn', 'legolas', 'boromir', '']


## Dict comprehensions¶

• Recall that the main difference between a list comprehension and a dict comprehension is the use of curly braces {} instead of []. Additionally, members of the dictionary are created using a colon :, as in key:value
• Create dictionaries
• Use curly braces {} instead of brackets []
In [4]:
# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']

# Create dict comprehension: new_fellowship
new_fellowship = {member:len(member) for member in fellowship}

# Print the new list
print(new_fellowship)

{'aragorn': 7, 'frodo': 5, 'samwise': 7, 'merry': 5, 'gimli': 5, 'boromir': 7, 'legolas': 7}


# Generator expressions¶

• Recall list comprehension
• Use ( ) instead of [ ]
In [9]:
g = (2 * num for num in range(10))
g

Out[9]:
<generator object <genexpr> at 0x0000000004335A20>

## List comprehensions vs. generators¶

• List comprehension - returns a list
• Generators - returns a generator object
• Both can be iterated over
In [13]:
(num for num in range(10*1000000) if num % 2 == 0)

Out[13]:
<generator object <genexpr> at 0x0000000004335E10>

## Generator functions¶

#### Generator functions are functions that, like generator expressions, yield a series of values, instead of returning a single value. A generator function is defined as you do a regular function, but whenever it generates a value, it uses the keyword yield instead of return.¶

• Produces generator objects when called
• Defined like a regular function - def
• Yields a sequence of values instead of returning a single value
• Generates a value with yield keyword
In [15]:
def num_sequence(n):

"""Generate values from 0 to n."""
i = 0
while i < n:
yield i
i += 1

In [17]:
test=num_sequence(7)
print type(test)

<type 'generator'>

In [21]:
next(test)

Out[21]:
3
In [22]:
test.next()

Out[22]:
4

## List comprehensions for time-stamped data¶

### the pandas Series¶

• single-dimension arrays
• Extract the column 'created_at' from df and assign the result to tweet_time. Fun fact: the extracted column in tweet_time here is a Series data structure!
• Create a list comprehension that extracts the time from each row in tweet_time. Each row is a string that represents a timestamp, and you will access the 12th to 19th characters in the string (entry[11:19]) to extract the time. Use entry as the iterator variable and assign the result to tweet_clock_time.
In [27]:
import pandas as pd

# Extract the created_at column from df: tweet_time
tweet_time = df['created_at']

# Extract the clock time: tweet_clock_time
tweet_clock_time = [entry[11:19] for entry in tweet_time]

# Print the extracted times
print(tweet_clock_time[:100])

['05:24:51', '05:24:57', '05:25:38', '05:25:42', '05:25:48', '05:25:53', '05:25:58', '05:26:12', '05:26:27', '05:26:30', '05:26:35', '05:26:48', '05:27:56', '05:28:28', '05:28:28', '05:28:40', '05:28:55', '05:30:06', '05:30:18', '05:30:20', '05:30:53', '05:30:55', '05:31:41', '05:32:20', '05:32:23', '05:32:32', '05:34:11', '05:34:17', '05:36:07', '05:38:17', '05:38:26', '05:39:39', '05:39:48', '05:40:07', '05:40:19', '05:40:58', '05:41:06', '05:41:21', '05:41:34', '05:41:51', '05:42:13', '05:42:51', '05:43:20', '05:43:24', '05:43:34', '05:44:36', '05:45:16', '05:45:40', '05:46:38', '05:46:40', '05:46:56', '05:47:07', '05:47:36', '05:47:44', '05:47:50', '05:48:01', '05:48:19', '05:49:10', '05:49:31', '05:49:36', '05:49:39', '05:49:39', '05:49:48', '05:49:52', '05:49:54', '05:50:04', '05:50:07', '05:50:16', '05:50:21', '05:50:35', '05:50:46', '05:50:49', '05:50:49', '05:50:56', '05:51:15', '05:51:26', '05:51:28', '05:51:43', '05:52:27', '05:52:32', '05:52:35', '05:52:45', '05:53:00', '05:53:33', '05:53:37', '05:53:55', '05:53:59', '05:54:14', '05:54:26', '05:54:55', '05:54:59', '05:55:25', '05:55:31', '05:55:39', '05:55:53', '05:55:57', '05:56:02', '05:56:14', '05:56:17', '05:56:29']


### Conditional list comprehensions for time-stamped data¶

• add a conditional expression to the list comprehension so that you only select the times in which entry[17:19] is equal to '19'
In [28]:
# Extract the created_at column from df: tweet_time
tweet_time = df['created_at']

# Extract the clock time: tweet_clock_time
tweet_clock_time = [entry[11:19] for entry in tweet_time if entry[17:19] == '19']

# Print the extracted times
print(tweet_clock_time)

['05:40:19', '05:48:19', '06:02:19', '06:03:19', '04:56:19', '05:40:19', '05:48:19', '06:02:19', '06:03:19', '03:31:19', '03:54:19', '04:23:19']

In [ ]:



Python iterators

tool2

# Iterators, load file in chunks¶

## Iterators vs Iterables¶

### an iterable is an object that can return an iterator¶

• Examples: lists, strings, dictionaries, file connections
• An object with an associated iter() method
• Applying iter() to an iterable creates an iterator

### an iterator is an object that keeps state and produces the next value when you call next() on it.¶

• Produces next value with next()
In [30]:
a=[1,2,3,4]
b=iter([1,2,3,4])
c=iter([5,6,7,8])

In [31]:
print a
print b
print next(b),next(b),next(b),next(b)
print list(c)

[1, 2, 3, 4]
<listiterator object at 0x00000000044B5A90>
1 2 3 4
[5, 6, 7, 8]


### Iterating over iterables¶

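• The example below does NOT work in Python 2: there, range() tries to build the whole list up front (hence the OverflowError shown below); the description that follows applies to Python 3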
• range() doesn't actually create the list; instead, it creates a range object with an iterator that produces the values until it reaches the limit

• If range() created the actual list, calling it with a value of 10^100 may not work, especially since a number as big as that may go over a regular computer's memory. The value 10^100 is actually what's called a Googol which is a 1 followed by a hundred 0s. That's a huge number!

• calling range() with 10^100 won't actually pre-create the list.
In [15]:
# Create an iterator for range(10 ** 100): googol
googol = iter(range(10 ** 100))

---------------------------------------------------------------------------
OverflowError                             Traceback (most recent call last)
<ipython-input-15-57ef632b6db1> in <module>()
1 # Create an iterator for range(10 ** 100): googol
----> 2 googol = iter(range(10 ** 100))
3

OverflowError: range() result has too many items

### Iterating over dictionaries¶

In [32]:
a={1:9, 'what':'why?'}

In [34]:
for key,value in a.items(): print key,value

1 9
what why?


### Iterating over file connections¶

In [40]:
f = open('university_towns.txt')
type(f)

Out[40]:
file
In [47]:
iter(f)

Out[47]:
<open file 'university_towns.txt', mode 'r' at 0x00000000041F9F60>
In [48]:
iter(f)==f

Out[48]:
True
In [49]:
next(f)

Out[49]:
'Florence (University of North Alabama)\n'
In [50]:
next(iter(f))

Out[50]:
'Jacksonville (Jacksonville State University)[2]\n'

## Using enumerate¶

### enumerate() returns an enumerate object that produces a sequence of tuples, and each of the tuples is an index-value pair.¶

In [65]:
# Create a list of strings: mutants
mutants = ['charles xavier',  'bobby drake', 'kurt wagner',  'max eisenhardt',  'kitty pride']
# Create a list of tuples: mutant_list
mutant_list = list(enumerate(mutants))
# Print the list of tuples
print(mutant_list)
print
# Unpack and print the tuple pairs
for index1, value1 in enumerate(mutants):
print(index1, value1)

print "\nChange the start index\n"
for index2, value2 in enumerate(mutants, start=3):
print(index2, value2)

[(0, 'charles xavier'), (1, 'bobby drake'), (2, 'kurt wagner'), (3, 'max eisenhardt'), (4, 'kitty pride')]

(0, 'charles xavier')
(1, 'bobby drake')
(2, 'kurt wagner')
(3, 'max eisenhardt')
(4, 'kitty pride')

Change the start index

(3, 'charles xavier')
(4, 'bobby drake')
(5, 'kurt wagner')
(6, 'max eisenhardt')
(7, 'kitty pride')


## Using zip¶

### zip(), which takes any number of iterables and returns a zip object that is an iterator of tuples.¶

• If you wanted to print the values of a zip object, you can convert it into a list and then print it.
• Printing just a zip object will not return the values unless you unpack it first.

### In Python 2 , zip() returns a list¶

• Docstring: zip(seq1 [, seq2 [...]]) -> [(seq1[0], seq2[0] ...), (...)]

• Return a list of tuples, where each tuple contains the i-th element from each of the argument sequences. The returned list is truncated in length to the length of the shortest argument sequence.

In [73]:
aliases = ['prof x', 'iceman', 'nightcrawler', 'magneto', 'shadowcat']
powers = ['telepathy','thermokinesis','teleportation','magnetokinesis','intangibility']

# Create a list of tuples: mutant_data
mutant_data = list(zip(mutants, aliases, powers))

# Print the list of tuples
print(mutant_data)

print
# Create a zip object using the three lists: mutant_zip
mutant_zip = zip(mutants, aliases, powers)

# Print the zip object
print(type(mutant_zip))

# Unpack the zip object and print the tuple values
for value1, value2, value3 in mutant_zip:
print(value1, value2, value3)

[('charles xavier', 'prof x', 'telepathy'), ('bobby drake', 'iceman', 'thermokinesis'), ('kurt wagner', 'nightcrawler', 'teleportation'), ('max eisenhardt', 'magneto', 'magnetokinesis'), ('kitty pride', 'shadowcat', 'intangibility')]

<type 'list'>
('charles xavier', 'prof x', 'telepathy')
('bobby drake', 'iceman', 'thermokinesis')
('kurt wagner', 'nightcrawler', 'teleportation')
('max eisenhardt', 'magneto', 'magnetokinesis')


• There can be too much data to hold in memory
• Solution: load data in chunks!
• Specify the chunk: chunksize
In [78]:
import pandas as pd
from time import time

In [110]:
# Time a single, all-at-once read of the CSV.
start = time()

# NOTE(review): the load step is absent from this listing, leaving `df`
# undefined. Reading the whole file in one call is the presumed original
# (file name taken from the chunked-read cells) -- confirm.
df = pd.read_csv('kamcord_data.csv')

print('used {:.2f} s'.format(time() - start))

print(df.shape)

used 0.40 s
(357404, 6)

Out[110]:
Unnamed: 0 user_id event_name event_time os_name app_version
0 0 5be921e9-e2bd-47ac-b1c8-94f7289324a2 APP_CLOSED 2016-09-18 03:06:25 android 2.5.1

### explore¶

In [ ]:
a=pd.read_csv('kamcord_data.csv',chunksize=4)

In [139]:
# Pull the next chunk from the reader `a`; the built-in next() works on
# Python 2.6+ and Python 3, unlike the Python-2-only .next() method.
next(a)

Out[139]:
Unnamed: 0 user_id event_name event_time os_name app_version
0 12 d078c3a4-9a80-4b12-9ca7-95873799f4be APP_CLOSED 2016-09-18 14:11:29 ios 6.4.1
1 13 a1ac31cb-6d06-401a-a33f-66f91abf1550 APP_CLOSED 2016-09-27 16:22:06 ios 6.4.1
2 14 48a70e65-205e-4ab9-9232-3bafa6fb9496 APP_CLOSED 2016-09-19 14:45:08 android 2.5.1
3 15 e8330f1a-eac6-4add-89a1-f3545b8189e7 SHOT_RECORDED 2016-09-18 12:52:17 android 2.5.1
In [140]:
# Fetch two consecutive chunks from the reader `a`. The built-in next() is
# used because the .next() method only exists on Python 2 iterators.
x = next(a)
y = next(a)

In [143]:
# Put the rows of chunk x after the rows of chunk y and renumber the index.
# NOTE(review): DataFrame.append no longer exists in pandas >= 2.0; the
# pd.concat call in the following cell is the portable spelling -- confirm
# which pandas version this notebook targets.
y.append(x, ignore_index=True)

Out[143]:
Unnamed: 0 user_id event_name event_time os_name app_version
0 20 d04b2d7a-d847-4790-b8ec-a975e7ba56a4 APP_CLOSED 2016-09-19 04:23:31 android 2.5.1
1 21 8dc251b8-03b6-4671-8780-389cd3bc3004 APP_CLOSED 2016-09-11 14:01:04 ios 6.4.1
2 22 e97f8a1a-bdcd-4d38-ac73-63d2b0105395 APP_CLOSED 2016-09-16 19:08:45 android 2.5.1
3 23 beb48c53-d807-4e1a-b1b6-cc20eebf679c SHOT_RECORDED 2016-09-11 06:30:35 android 2.5.1
4 16 e1f9a1cd-605d-4b94-9dfb-a011f9ec2e0d APP_OPEN 2016-09-25 21:17:22 ios 6.4.1
5 17 95b9becf-fa38-4c4a-b265-8bf2594b911a APP_OPEN 2016-09-24 16:58:35 android 2.6
6 18 19836371-f0f0-4db0-b027-a3fa2d0dbf35 SHOT_RECORDED 2016-09-23 12:15:03 ios 6.4.1
7 19 c39eeee3-6605-4970-95b8-0ddb21c81589 SHOT_RECORDED 2016-09-24 04:26:03 android 2.6
In [144]:
pd.concat([x,y], ignore_index=True)

Out[144]:
Unnamed: 0 user_id event_name event_time os_name app_version
0 16 e1f9a1cd-605d-4b94-9dfb-a011f9ec2e0d APP_OPEN 2016-09-25 21:17:22 ios 6.4.1
1 17 95b9becf-fa38-4c4a-b265-8bf2594b911a APP_OPEN 2016-09-24 16:58:35 android 2.6
2 18 19836371-f0f0-4db0-b027-a3fa2d0dbf35 SHOT_RECORDED 2016-09-23 12:15:03 ios 6.4.1
3 19 c39eeee3-6605-4970-95b8-0ddb21c81589 SHOT_RECORDED 2016-09-24 04:26:03 android 2.6
4 20 d04b2d7a-d847-4790-b8ec-a975e7ba56a4 APP_CLOSED 2016-09-19 04:23:31 android 2.5.1
5 21 8dc251b8-03b6-4671-8780-389cd3bc3004 APP_CLOSED 2016-09-11 14:01:04 ios 6.4.1
6 22 e97f8a1a-bdcd-4d38-ac73-63d2b0105395 APP_CLOSED 2016-09-16 19:08:45 android 2.5.1
7 23 beb48c53-d807-4e1a-b1b6-cc20eebf679c SHOT_RECORDED 2016-09-11 06:30:35 android 2.5.1

In [146]:
# Time building the full frame incrementally, one chunk at a time.
start = time()

# c counts the chunks consumed; the first chunk seeds df and every later
# chunk is concatenated onto it.
# NOTE(review): the loop header is missing from the original listing. A
# 50000-row chunk size reproduces the 8 chunks reported in the output and
# matches the get_chunk(50000) cell -- confirm.
c = 0
for chuck in pd.read_csv('kamcord_data.csv', chunksize=50000):
    if c == 0:
        df = chuck
    else:
        # pd.concat stands in for DataFrame.append, which pandas 2.0 removed;
        # the resulting frame is the same.
        df = pd.concat([df, chuck], ignore_index=True)
    c += 1
print(c)

print('used {:.2f} s'.format(time() - start))

print(df.shape)

8
used 0.48 s
(357404, 6)

Out[146]:
Unnamed: 0 user_id event_name event_time os_name app_version
0 0 5be921e9-e2bd-47ac-b1c8-94f7289324a2 APP_CLOSED 2016-09-18 03:06:25 android 2.5.1

In [149]:
# Time the collect-then-concatenate approach: gather every chunk in a list
# and join them with a single pd.concat call.
start = time()

want = []

# NOTE(review): the loop header is missing from the original listing. A
# 50000-row chunk size reproduces the 8 chunks reported in the output -- confirm.
for chuck in pd.read_csv('kamcord_data.csv', chunksize=50000):
    want.append(chuck)

print(len(want))

df = pd.concat(want, ignore_index=True)

print('used {:.2f} s'.format(time() - start))

print(df.shape)

8
used 0.43 s
(357404, 6)

Out[149]:
Unnamed: 0 user_id event_name event_time os_name app_version
0 0 5be921e9-e2bd-47ac-b1c8-94f7289324a2 APP_CLOSED 2016-09-18 03:06:25 android 2.5.1

In [172]:
# Time reading the file by repeatedly asking the reader for 50000-row chunks.
start = time()

want = []

# NOTE(review): `f` is never created in the original listing. An iterator-mode
# reader over the same CSV is the presumed source -- confirm.
f = pd.read_csv('kamcord_data.csv', iterator=True)

go = True
while go:
    try:
        want.append(f.get_chunk(50000))
    except StopIteration as e:
        # The reader signals exhaustion with StopIteration; catching only that
        # lets real errors (bad path, parse failure) surface instead of
        # silently ending the loop.
        print(type(e))
        go = False

print(len(want))

df = pd.concat(want, ignore_index=True)

print('used {:.2f} s'.format(time() - start))

print(df.shape)

<type 'exceptions.StopIteration'>
8
used 0.43 s
(357404, 6)

Out[172]:
Unnamed: 0 user_id event_name event_time os_name app_version
0 0 5be921e9-e2bd-47ac-b1c8-94f7289324a2 APP_CLOSED 2016-09-18 03:06:25 android 2.5.1

## Processing large amounts of Twitter data by chunks¶

In [183]:
import pandas as pd

# Import package
import json

# Initialize empty list to store tweets: tweets_data
tweets_data = []

# One mark per input line: 'O' for a line that parsed, 'X' for one that did not
marks = []

# Read in tweets and store in list: tweets_data.
# The with-block closes the file even if something unexpected is raised.
with open('tweets.txt', 'r') as h:
    for i in h:
        try:
            # NOTE(review): the parse step is missing from the original listing,
            # leaving `tmp` undefined. Decoding each line as JSON is the presumed
            # original, given the json import above -- confirm.
            tmp = json.loads(i)
        except ValueError:
            # json raises ValueError (or a subclass) on malformed input
            marks.append('X')
        else:
            marks.append('O')
            tweets_data.append(tmp)

# Progress marks on one line, separated by spaces
print(' '.join(marks))

t_df = pd.DataFrame(tweets_data)
print(t_df.shape)

t_df.to_csv('tweets.csv', index=False, encoding='utf-8')

O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O X O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
(615, 33)

Out[183]:
contributors coordinates created_at display_text_range entities extended_entities extended_tweet favorite_count favorited filter_level ... quoted_status_id quoted_status_id_str retweet_count retweeted retweeted_status source text timestamp_ms truncated user
0 None None Fri Feb 10 05:24:51 +0000 2017 NaN {u'user_mentions': [{u'indices': [3, 16], u'sc... NaN NaN 0 False low ... 8.299017e+17 829901721182670849 0 False {u'contributors': None, u'truncated': False, u... <a href="http://twitter.com/#!/download/ipad" ... RT @MeckeringBoy: Does Pauline secretly wear h... 1486704291616 False {u'follow_request_sent': None, u'profile_use_b...

1 rows × 33 columns

## Processing large amounts of data by chunks¶

In [184]:
# Initialize an empty dictionary: counts_dict
counts_dict = {}

# Iterate over the file chunk by chunk.
# NOTE(review): the loop header is missing from the original listing. The file
# name and chunk size mirror the count_entries('tweets.csv', 10, 'lang') call
# further down -- confirm.
for chunk in pd.read_csv('tweets.csv', chunksize=10):

    # Iterate over the column in dataframe, tallying each value
    for entry in chunk['lang']:
        counts_dict[entry] = counts_dict.get(entry, 0) + 1

# Print the populated dictionary
print(counts_dict)

{'fr': 1, 'en': 597, 'und': 14, 'sv': 2, 'es': 1}


## Extracting information for large amounts of Twitter data¶

• reusable
• def func
In [185]:
# Define count_entries()
def count_entries(csv_file, c_size, colname):
    """Count how often each value occurs in one column of a CSV file.

    The file is read in chunks so it never has to fit in memory at once.

    Args:
        csv_file: path of the CSV file to read.
        c_size: number of rows per chunk passed to pd.read_csv.
        colname: name of the column whose values are counted.

    Returns:
        A dictionary mapping each distinct value in the column to the number
        of rows in which it appears.
    """

    # Initialize an empty dictionary: counts_dict
    counts_dict = {}

    # Iterate over the file chunk by chunk
    for chunk in pd.read_csv(csv_file, chunksize=c_size):

        # Iterate over the column in dataframe
        for entry in chunk[colname]:
            counts_dict[entry] = counts_dict.get(entry, 0) + 1

    # Return counts_dict
    return counts_dict

# Tally the 'lang' column of tweets.csv, reading 10 rows at a time
result_counts = count_entries(
    csv_file='tweets.csv',
    c_size=10,
    colname='lang',
)

# Show the tally
print(result_counts)

{'fr': 1, 'en': 597, 'und': 14, 'sv': 2, 'es': 1}

In [ ]:



# Python odds and ends scope, filter, reduce

 description code comments quickly assign values a,b,c = (3,7,12) unpack nested functions outer func return inner func
tool1

### Python's built-in scope¶

• check out Python's built-in scope, which is really just a built-in module called builtins
• to query builtins, you'll need to import builtins
In [16]:
import builtins

# List every name in the built-in scope; print() with a single argument
# behaves the same on Python 2 and Python 3.
print(dir(builtins))

['ArithmeticError', 'AssertionError', 'AttributeError', 'BaseException', 'BufferError', 'BytesWarning', 'DeprecationWarning', 'EOFError', 'Ellipsis', 'EnvironmentError', 'Exception', 'False', 'FloatingPointError', 'FutureWarning', 'GeneratorExit', 'IOError', 'ImportError', 'ImportWarning', 'IndentationError', 'IndexError', 'KeyError', 'KeyboardInterrupt', 'LookupError', 'MemoryError', 'NameError', 'None', 'NotImplemented', 'NotImplementedError', 'OSError', 'OverflowError', 'PendingDeprecationWarning', 'ReferenceError', 'RuntimeError', 'RuntimeWarning', 'StandardError', 'StopIteration', 'SyntaxError', 'SyntaxWarning', 'SystemError', 'SystemExit', 'TabError', 'True', 'TypeError', 'UnboundLocalError', 'UnicodeDecodeError', 'UnicodeEncodeError', 'UnicodeError', 'UnicodeTranslateError', 'UnicodeWarning', 'UserWarning', 'ValueError', 'Warning', 'WindowsError', 'ZeroDivisionError', '__builtins__', '__doc__', '__file__', '__future_module__', '__name__', '__package__', '__path__', 'abs', 'absolute_import', 'all', 'any', 'apply', 'ascii', 'basestring', 'bin', 'bool', 'buffer', 'bytearray', 'bytes', 'callable', 'chr', 'classmethod', 'cmp', 'coerce', 'compile', 'complex', 'copyright', 'credits', 'delattr', 'dict', 'dir', 'divmod', 'dreload', 'enumerate', 'eval', 'execfile', 'file', 'filter', 'float', 'format', 'frozenset', 'get_ipython', 'getattr', 'globals', 'hasattr', 'hash', 'help', 'hex', 'id', 'input', 'int', 'intern', 'isinstance', 'issubclass', 'iter', 'len', 'license', 'list', 'locals', 'long', 'map', 'max', 'memoryview', 'min', 'next', 'object', 'oct', 'open', 'ord', 'pow', 'print', 'property', 'range', 'raw_input', 'reduce', 'reload', 'repr', 'reversed', 'round', 'set', 'setattr', 'slice', 'sorted', 'staticmethod', 'str', 'sum', 'super', 'sys', 'tuple', 'type', 'unichr', 'unicode', 'vars', 'xrange', 'zip']


## nested functions¶

• return inner function
In [22]:
def raise_val(n):
    """Return a function that raises its argument to the power n.

    Args:
        n: the exponent, captured by the returned inner function.

    Returns:
        A one-argument function computing x ** n.
    """

    def inner(x):
        """Raise x to the power of n."""
        raised = x ** n
        return raised

    return inner

In [25]:
# Build two closures from raise_val: one squares, one cubes.
square = raise_val(2)
cube = raise_val(3)

# Print both results on one line separated by a space; formatting the string
# first gives the same text on Python 2 and Python 3.
print('{} {}'.format(square(6), cube(6)))

36 216


### pass parameters¶

In [30]:
raise_val(4)(3)

Out[30]:
81

## scope searched¶

• Local scope
• Enclosing functions
• Global
• Built-in
In [44]:
n = 3


def outer():
    """Print the n seen inside inner(), then the n seen inside outer().

    Both functions assign their own local n, so the module-level n is
    neither read nor changed by calling outer().
    """
    n = 1

    def inner():
        # This assignment creates a name local to inner(); it shadows the
        # enclosing n without modifying it.
        n = 2
        print(n)

    inner()
    print(n)

In [45]:
outer()

2
1


### nested func¶

• one reason to nest functions is to create a closure
• This means that the nested or inner function remembers the state of its enclosing scope when called
• Thus, anything defined locally in the enclosing scope is available to the inner function even when the outer function has finished execution
In [46]:
# Define echo
def echo(n):
    """Return a function that repeats a word n times.

    Args:
        n: repetition count, remembered by the returned closure.

    Returns:
        The inner_echo function bound to this n.
    """

    # Define inner_echo
    def inner_echo(word1):
        """Concatenate n copies of word1."""
        echo_word = word1 * n
        return echo_word

    # Return inner_echo
    return inner_echo

# Call echo: twice
twice = echo(2)

# Call echo: thrice
thrice = echo(3)

# Call twice() and thrice() then print
print(twice('hello'), thrice('hello'))

('hellohello', 'hellohellohello')

In [48]:
echo(7)('wtf ')

Out[48]:
'wtf wtf wtf wtf wtf wtf wtf '

## flexible arguments¶

### Function with variable-length arguments (*args)¶

In [51]:
# Define gibberish
def gibberish(*notmatter):
    """Join all positional string arguments, each followed by one space.

    Args:
        *notmatter: any number of strings.

    Returns:
        The strings concatenated in order, each followed by a single space
        (so the result ends with a space), or '' when called with no arguments.
    """

    # Initialize an empty string: hodgepodge
    hodgepodge = ''

    # Concatenate the strings in notmatter
    for word in notmatter:
        hodgepodge += word + ' '

    # Return hodgepodge
    return hodgepodge

# Call gibberish() with one string: one_word
one_word = gibberish("luke")

# Call gibberish() with five strings: many_words
many_words = gibberish("luke", "leia", "han", "obi", "darth")

# Print one_word and many_words
print(one_word)
print(many_words)

luke
luke leia han obi darth


### Function with variable-length keyword arguments (**kwargs)¶

In [55]:
# Define report_status
def report_status(**whatevername):
    """Print out the status of a movie character.

    Args:
        **whatevername: arbitrary keyword arguments; each one is reported as
            a "key: value" line. Values may be of any printable type.

    Returns:
        None. The report is written to standard output.
    """

    print("\nBEGIN: REPORT\n")

    # Show the raw keyword dictionary, followed by a blank line
    print(whatevername)
    print('')

    # Print a formatted status report; str.format is used instead of "+" so
    # that non-string values do not raise TypeError
    for key, value in whatevername.items():
        print("{}: {}".format(key, value))

    print("\nEND REPORT")

# First call to report_status()
report_status(name="luke", affiliation="jedi", status="missing")

# Second call to report_status()
report_status(name="anakin", affiliation="sith lord", status="deceased")

BEGIN: REPORT

{'status': 'missing', 'affiliation': 'jedi', 'name': 'luke'}

status: missing
affiliation: jedi
name: luke

END REPORT

BEGIN: REPORT

{'status': 'deceased', 'affiliation': 'sith lord', 'name': 'anakin'}

status: deceased
affiliation: sith lord
name: anakin

END REPORT


### Map() and lambda functions¶

In [56]:
# The spells to be shouted
spells = [
    'protego',
    'accio',
    'expecto patronum',
    'legilimens',
]

# map() runs the lambda on every spell, tacking three exclamation marks on
shout_spells = map(lambda spell: spell + '!!!', spells)

# Materialise the mapped values as a list: shout_spells_list
shout_spells_list = list(shout_spells)

# Show the shouted spells
print(shout_spells_list)

['protego!!!', 'accio!!!', 'expecto patronum!!!', 'legilimens!!!']


## Filter() and lambda functions¶

### The function filter() offers a way to filter out elements from a list that don't satisfy certain criteria.¶

• filter(function or None, sequence) -> list, tuple, or string

• Return those items of sequence for which function(item) is true. If function is None, return the items that are true. If sequence is a tuple or string, return the same type, else return a list.

In [59]:
# Members of the fellowship
fellowship = [
    'frodo',
    'samwise',
    'merry',
    'aragorn',
    'legolas',
    'boromir',
    'gimli',
]

# filter() keeps only the names for which the lambda is true,
# i.e. names longer than six characters
result = filter(lambda name: 6 < len(name), fellowship)

# Materialise the kept names as a list: result_list
result_list = list(result)

# Show the surviving names
print(result_list)

['samwise', 'aragorn', 'legolas', 'boromir']

In [69]:
filter(lambda member: len(member) >3, ['1234','234','34567'])

Out[69]:
['1234', '34567']
In [83]:
filter(None, [12>1, 'wtf' if 2>1 else 0, 'aiya' if 3>2 else 7, 'momomo' if 4>5 else -44])

Out[83]:
[True, 'wtf', 'aiya', -44]

## Reduce() and lambda functions¶

### To use reduce(), you must import it from the functools module.¶

• reduce(function, sequence[, initial]) -> value
• Apply a function of two arguments cumulatively to the items of a sequence, from left to right, so as to reduce the sequence to a single value.
• For example, reduce(lambda x, y: x+y, [1, 2, 3, 4, 5]) calculates ((((1+2)+3)+4)+5).
• If initial is present, it is placed before the items of the sequence in the calculation, and serves as a default when the sequence is empty.
In [86]:
# reduce() lives in functools
from functools import reduce

# The Stark names to be joined
stark = [
    'robb',
    'sansa',
    'arya',
    'eddard',
    'jon',
]

# Use reduce() to fold the list left to right, gluing each name onto the
# running string with a single space: result
result = reduce(lambda joined, name: joined + ' ' + name, stark)

# Show the joined names
print(result)

robb sansa arya eddard jon


### error handling¶

• raise error
In [88]:
# Adding a str and an int raises TypeError; catch it and show its message.
# The "except ... as e" form is accepted by Python 2.6+ and Python 3 alike,
# whereas "except Exception, e" is Python-2-only.
try:
    '3' + 3
except TypeError as e:
    print(e)

cannot concatenate 'str' and 'int' objects

In [91]:
# Define shout_echo
def shout_echo(word1, echo=1):
    """Concatenate echo copies of word1 and three
    exclamation marks at the end of the string.

    Args:
        word1: the string to repeat.
        echo: how many copies of word1 to concatenate (default 1).

    Returns:
        word1 repeated echo times, followed by '!!!'.

    Raises:
        ValueError: if echo is negative.
    """

    # Raise an error with raise.
    # NOTE: the check is echo < 0, so echo == 0 is accepted even though the
    # message reads "greater than 0".
    if echo < 0:
        raise ValueError('echo must be greater than 0')

    # Concatenate echo copies of word1 using *: echo_word
    echo_word = word1 * echo

    # Concatenate '!!!' to echo_word: shout_word
    shout_word = echo_word + '!!!'

    # Return shout_word
    return shout_word

# Call shout_echo with an invalid echo and report the error message.
# "except ... as e" works on Python 2.6+ and Python 3.
try:
    shout_echo("particle", echo=-3)
except ValueError as e:
    print(e)

echo must be greater than 0

In [96]:
shout_echo("123", echo=-1)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-96-44c64dc8c830> in <module>()
----> 1 shout_echo("123", echo=-1)

<ipython-input-91-d6d66ed4753d> in shout_echo(word1, echo)
6     # Raise an error with raise
7     if echo < 0:
----> 8         raise ValueError('echo must be greater than 0')
9
10     # Concatenate echo copies of word1 using *: echo_word

ValueError: echo must be greater than 0

### use filter(lambda x: ...) in Pandas¶

In [98]:
# Select retweets from the Twitter dataframe: result

result = filter(lambda x: x[0:2] == 'RT', df['text'])

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-98-ac4d1cb6d465> in <module>()
1 # Select retweets from the Twitter dataframe: result
2
----> 3 result = filter(lambda x: x[0:2] == 'RT', df['text'])

NameError: name 'df' is not defined
In [ ]:



# database python connection basic Sqlalchemy & Pandas

connecting a database through Python

Sqlite

Sqlalchemy

Pandas

basic