# Import pandas
import pandas as pd
# Load the tweets file; 'created_at' becomes the row index and pandas is
# asked to parse that index as dates.
df = pd.read_csv(
    'tweets.csv',
    index_col='created_at',
    parse_dates=True,
)
# Keep three columns of interest in a separate frame: results
results = df.loc[:, ['entities', 'favorited', 'text']]
# Peek at the first three rows of the subset, then at the index of the
# full frame (bare expressions: only displayed in an interactive session).
results.head(3)
df.index
# Label- and position-based selection examples on the tweets frame.
# These are bare expressions: they display a result only in an interactive
# session and do not modify `df`.

# Label slices on the index (assumes the index parsed to datetimes -- TODO
# confirm), first forwards and then with a step of -2.
df.loc['02-10-17 05:24':'02-10-17 05:24']
df.loc['02-10-17 05:24':'02-10-17 05:24':-2]
# Positional slices: rows 1, 3, 5 and columns 4, 7, 10, 13.
df.iloc[1:7:2, 4:14:3]
# Attribute access and bracket access should give the same column.
# Series.equals returns a single bool; comparing the Series wrapped in lists
# falls back to bool(Series), which raises ValueError unless both sides
# happen to be the very same object.
df.entities.equals(df['entities'])
df['text'][2:7]
# `.ix` was removed in pandas 1.0: pick rows by position, then columns by
# label, which is what `.ix[3:7, 'entities':'favorited']` did on a
# non-integer index.
df.iloc[3:7].loc[:, 'entities':'favorited']
df.loc[:, 'entities':'favorited'].head()
df.loc['2-10-17 05:20', 'entities':'favorited']
df.loc['2-10-17 05:24', ['entities', 'text', 'favorited']]
df.iloc[7:11, 11:14]
df.iloc[[3, 74, 233], [11, 14, 7]]
# Single brackets return a Series, double brackets a one-column DataFrame.
df['text'].head()
df[['text']].head()
# Boolean filtering and missing-data inspection on the tweets frame.
# Apart from df.info(), which prints a summary, these are bare expressions
# that only display a result in an interactive session.

# Boolean mask of truncated tweets, then the first matching rows.
df['truncated'] == 1
df[df['truncated'] == 1].head(3)
# Truncated tweets whose text is longer than 140 characters.
df[(df['text'].str.len() > 140) & (df['truncated'] == 1)].head(3)
# Column-wise truthiness checks.
df.all()
df.any()
# Columns that contain nothing but missing values.
df.loc[:, df.isnull().all()]
df.info()
df.shape
# Shape after dropping rows that are entirely / partially missing, or that
# have fewer than 20 non-missing values.
df.dropna(how='all').shape
df.dropna(how='any').shape
df.dropna(thresh=20).shape
# The same idea applied to columns.
df.dropna(how='all', axis=1).shape
df.dropna(how='any', axis=1).shape
# `how` and `thresh` cannot be combined (TypeError on current pandas);
# older versions silently ignored `how` when `thresh` was given, so passing
# `thresh` alone keeps that result: columns with at least 100 non-missing
# values are retained.
df.dropna(axis='columns', thresh=100).shape
import numpy as np
# Overwrite every value of the existing 'entities' column with NaN.
df['entities'] = np.nan
df.head(3)
# Texts longer than 144 characters, followed by how many there are.
df.loc[df['text'].str.len().gt(144), 'text']
df['text'].str.len().gt(144).sum()
# Small one-column sample used to compare two ways of floor-dividing.
test = df[['timestamp_ms']].head()
test
# Vectorised floor division by 7**11 ...
test['timestamp_ms'].floordiv(7 ** 11)
# ... and the same computation element by element.
test.timestamp_ms.apply(lambda ms: ms // 7 ** 11)
# Element-wise comparison of the two results.
test['timestamp_ms'].floordiv(7 ** 11) == test.timestamp_ms.apply(lambda ms: ms // 7 ** 11)
# Element-wise transformations on a small sample of the tweets frame.

# Take an explicit copy of the first ten texts so the assignment below writes
# to an independent frame instead of a slice (avoids SettingWithCopyWarning).
text = df[['text']].head(10).copy()
text
# Upper-case the text column in place on the copy.
text['text'] = text['text'].str.upper()
text
text.index
# Index labels converted to strings.
text.index.map(str)
# `weekday_name` was removed in pandas 1.0; day_name() is its replacement.
# Assumes the index is a DatetimeIndex -- TODO confirm against the CSV.
text.index.day_name()
# Remainder modulo 7, applied column by column.
df[['timestamp_ms', 'quoted_status_id']].apply(lambda x: x % 7).head()
# Translate the boolean 'truncated' flag into labels via a lookup dict.
pre = {True: 'truth', False: 'lie'}
df['truncated'].map(pre).head(30)
# In statistics, the z-score is the number of standard deviations by which an
# observation lies above the mean; a negative z-score means the observation is
# below the mean.
import pandas as pd
# Open the census file as a chunked reader (10,000 rows per chunk) and pull
# the first chunk. The builtin next() is used because the reader has no
# Python 2 style `.next()` method on Python 3.
f = pd.read_csv('census.csv', chunksize=10000)
df2 = next(f)
df2.info()
# Keep the two population columns; copy explicitly because a new column is
# added to this frame below (avoids SettingWithCopyWarning).
df3 = df2[['CENSUS2010POP', 'POPESTIMATE2015']].copy()
df3.head()
# Import zscore from scipy.stats
from scipy.stats import zscore
import numpy as np
# Call zscore with df3['CENSUS2010POP'] as input: turnout_zscore
turnout_zscore = zscore(df3['CENSUS2010POP'])
# Print the type of turnout_zscore
print(type(turnout_zscore))
# Assign turnout_zscore to a new column: df3['turnout_zscore']
df3['turnout_zscore'] = turnout_zscore
# Print the output of df3.head()
print(df3.head())
df3.head()
# Time three computations on the 2010 census population column. The IPython
# `%%timeit` cell magics are not valid Python syntax in a script, so the
# standard-library timeit module is used instead. As with the magic, the
# timed statements do not leave any result variables behind.
# Note: Series.std() defaults to ddof=1 while np.std defaults to ddof=0, so
# the last two are timed side by side but do not compute the same value.
import timeit

_TIMING_RUNS = 100
_benchmarks = (
    ('scipy.stats.zscore', lambda: zscore(df3['CENSUS2010POP'])),
    ('Series.std', lambda: df3['CENSUS2010POP'].std()),
    ('np.std', lambda: np.std(df3['CENSUS2010POP'])),
)
for _label, _stmt in _benchmarks:
    _seconds_per_call = timeit.timeit(_stmt, number=_TIMING_RUNS) / _TIMING_RUNS
    print('{}: {:.1f} us per call'.format(_label, _seconds_per_call * 1e6))

# Rows ordered from the highest z-score to the lowest.
df3.sort_values('turnout_zscore', ascending=False)