Loading data using generators and chunks
An example not using Pandas
This is a study note summary of some courses from DataCamp 🙂
In [2]:
import pandas as pd
!dir
In [43]:
# Read the CSV lazily in chunks of 10,000 rows; read_csv returns an iterator
f = pd.read_csv('WDI_Data.csv', chunksize=10000)
df = next(f)  # pull the first 10,000-row chunk as a DataFrame
df.shape
Out[43]:
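As an aside, the object that read_csv returns when chunksize is set is itself an iterator of DataFrame chunks, so whole-file aggregates can be computed without ever loading the full file. A minimal sketch (reader and total_rows are illustrative names):
In [ ]:
# Only one 10,000-row chunk is in memory at a time
reader = pd.read_csv('WDI_Data.csv', chunksize=10000)
total_rows = sum(len(chunk) for chunk in reader)
print(total_rows)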
In [44]:
print(df.columns)
df = df.iloc[:, :5].dropna()  # keep the first five columns and drop rows with missing values
In [45]:
df.shape
Out[45]:
In [48]:
df[df['Indicator Code'] == 'SP.ADO.TFRT']
Out[48]:
In [55]:
content = df[df['Indicator Code'] == 'SP.ADO.TFRT'].iloc[0]
In [65]:
row = list(content.values)
row
Out[65]:
In [64]:
names = ['CountryName', 'CountryCode', 'IndicatorName', 'IndicatorCode', 'Year', 'Value']
Dictionaries for data science¶
In [66]:
# Zip lists: zipped_lists
zipped_lists = zip(names, row)
# Create a dictionary: rs_dict
rs_dict = dict(zipped_lists)
# Print the dictionary
print(rs_dict)
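One caveat with zip() in Python 3: it returns a lazy iterator, not a list, so it can be consumed only once. A quick sketch:
In [ ]:
zipped = zip(names, row)
print(dict(zipped))  # consuming the iterator builds the dict
print(list(zipped))  # the iterator is now exhausted: prints []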
Writing a function¶
In [68]:
# Define lists2dict()
def lists2dict(list1, list2):
    """Return a dictionary where list1 provides
    the keys and list2 provides the values."""

    # Zip lists: zipped_lists
    zipped_lists = zip(list1, list2)

    # Create a dictionary: rs_dict
    rs_dict = dict(zipped_lists)

    # Return the dictionary
    return rs_dict
# Call lists2dict: rs_fxn
rs_fxn = lists2dict(names, row)
# Print rs_fxn
print(rs_fxn)
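Since the body is just dict(zip(...)), the same result is available as a one-liner; the function mainly adds a name and a docstring:
In [ ]:
print(dict(zip(names, row)))  # equivalent to lists2dict(names, row)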
Using a list comprehension¶
In [81]:
# Print the first two rows of the DataFrame
print(df.iloc[0, :])
print()
print(df.iloc[1, :])
print()
# Turn list of lists into list of dicts: list_of_dicts
list_of_dicts = [lists2dict(names, sublist) for sublist in df.values]
# Print the first two dictionaries in list_of_dicts
print(list_of_dicts[0])
print(list_of_dicts[1])
Turning this all into a DataFrame¶
In [86]:
# Import the pandas package
import pandas as pd
# Turn list of lists into list of dicts: list_of_dicts
list_of_dicts = [lists2dict(names, sublist) for sublist in df.values]
# Turn list of dicts into a dataframe: df2
df2 = pd.DataFrame(list_of_dicts)
print(df2.shape)
# Print the head of the dataframe
df2.head()
Out[86]:
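Note that names has six entries while the trimmed df has only five columns; zip() stops at the shorter input, so each dict above ends up with only the first five names as keys. The same frame can also be built directly from the row values, provided the column list is sliced to match (df3 is an illustrative name):
In [ ]:
# Build the DataFrame without the intermediate dicts;
# slice names so its length matches the number of columns
df3 = pd.DataFrame(df.values, columns=names[:df.shape[1]])
df3.head()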
In [90]:
# Open a connection to the file
with open('WDI_Data.csv') as f:

    # Skip the column names
    f.readline()

    # Initialize an empty dictionary: counts_dict
    counts_dict = {}

    # Process only the first 1000 rows
    for j in range(1000):

        # Split the current line into a list: line
        line = f.readline().split(',')

        # Get the value for the first column: first_col
        first_col = line[0]

        # If the column value is in the dict, increment its value
        if first_col in counts_dict:
            counts_dict[first_col] += 1

        # Else, add to the dict and set value to 1
        else:
            counts_dict[first_col] = 1

# Print the resulting dictionary
print(counts_dict)
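The if/else bookkeeping can also be collapsed with dict.get(), which supplies a default when a key is missing. A minimal sketch of the same first-1000-rows count:
In [ ]:
with open('WDI_Data.csv') as f:
    f.readline()  # skip the header
    counts_dict = {}
    for j in range(1000):
        first_col = f.readline().split(',')[0]
        counts_dict[first_col] = counts_dict.get(first_col, 0) + 1
print(counts_dict)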
In the previous exercise, you processed a file line by line for a given number of lines. What if, however, you want to do this for the entire file?¶
In this case, it would be useful to use generators. Generators allow users to lazily evaluate data.¶
- This concept of lazy evaluation is useful when you have to deal with very large datasets because it lets you generate values in an efficient manner by yielding only chunks of data at a time instead of the whole thing at once.
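For instance, a generator expression produces values on demand, so it can represent a sequence far too large to materialize as a list. A quick sketch:
In [ ]:
# A list comprehension would allocate a billion numbers up front;
# the generator expression yields them one at a time
lazy_squares = (x ** 2 for x in range(10 ** 9))
print(next(lazy_squares))  # 0
print(next(lazy_squares))  # 1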
Next, define a generator function read_large_file() that produces a generator object which yields a single line from a file each time next() is called on it.¶
In [92]:
# Define read_large_file()
def read_large_file(file_object):
    """A generator function to read a large file lazily."""

    # Loop indefinitely until the end of the file
    while True:

        # Read a line from the file: data
        data = file_object.readline()

        # Break if this is the end of the file
        if not data:
            break

        # Yield the line of data
        yield data

# Open a connection to the file
with open('WDI_Data.csv') as file:

    # Create a generator object for the file: gen_file
    gen_file = read_large_file(file)

    # Print the first three lines of the file
    print(next(gen_file))
    print(next(gen_file))
    print(next(gen_file))
- You've just created a generator function that you can use to help you process large files.
- Now process the file line by line to build a dictionary counting how many times each country appears in the first column of the dataset.
- This time, you'll process the entire dataset!
In [93]:
# Initialize an empty dictionary: counts_dict
counts_dict = {}

# Open a connection to the file
with open('WDI_Data.csv') as file:

    # Iterate over the generator from read_large_file()
    for line in read_large_file(file):
        row = line.split(',')
        first_col = row[0]
        if first_col in counts_dict:
            counts_dict[first_col] += 1
        else:
            counts_dict[first_col] = 1

# Print
print(counts_dict)
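Two shortcuts worth knowing here: a file object is itself a lazy iterator over its lines, and collections.Counter replaces the if/else counting. A minimal sketch combining both:
In [ ]:
from collections import Counter

# File objects already yield one line at a time,
# so no custom generator is needed
with open('WDI_Data.csv') as file:
    next(file)  # skip the header row
    counts = Counter(line.split(',')[0] for line in file)
print(counts)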
Writing an iterator to load data in chunks¶
In [ ]:
# Initialize reader object: urb_pop_reader
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)
# Get the first dataframe chunk: df_urb_pop
df_urb_pop = next(urb_pop_reader)
# Check out the head of the dataframe
print(df_urb_pop.head())
# Check out specific country: df_pop_ceb
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB']
# Zip dataframe columns of interest: pops
pops = zip(df_pop_ceb['Total Population'],
df_pop_ceb['Urban population (% of total)'])
# Turn zip object into list: pops_list
pops_list = list(pops)
# Print pops_list
print(pops_list)
In [ ]:
# Initialize reader object: urb_pop_reader
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)
# Get the first dataframe chunk: df_urb_pop
df_urb_pop = next(urb_pop_reader)
# Check out specific country: df_pop_ceb
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'].copy()  # copy so the new column below can be added safely
# Zip dataframe columns of interest: pops
pops = zip(df_pop_ceb['Total Population'],
df_pop_ceb['Urban population (% of total)'])
# Turn zip object into list: pops_list
pops_list = list(pops)
# Use a list comprehension to create the new column 'Total Urban Population'
# (the urban share is a percentage, so scale the product by 0.01)
df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list]
# Plot urban population data
import matplotlib.pyplot as plt
df_pop_ceb.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()
In [ ]:
# Define plot_pop()
def plot_pop(filename, country_code):

    # Initialize reader object: urb_pop_reader
    urb_pop_reader = pd.read_csv(filename, chunksize=1000)

    # Initialize empty dataframe: data
    data = pd.DataFrame()

    # Iterate over each dataframe chunk
    for df_urb_pop in urb_pop_reader:

        # Check out the specific country (copy so a column can be added safely)
        df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code].copy()

        # Zip dataframe columns of interest: pops
        pops = zip(df_pop_ceb['Total Population'],
                   df_pop_ceb['Urban population (% of total)'])

        # Turn zip object into list: pops_list
        pops_list = list(pops)

        # Create the new column 'Total Urban Population' (scale the percentage by 0.01)
        df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list]

        # Append the chunk to data (DataFrame.append was removed in pandas 2.0, so use pd.concat)
        data = pd.concat([data, df_pop_ceb])

    # Plot urban population data
    data.plot(kind='scatter', x='Year', y='Total Urban Population')
    plt.show()
# Set the filename: fn
fn = 'ind_pop_data.csv'
# Call plot_pop for country code 'CEB'
plot_pop(fn, 'CEB')
# Call plot_pop for country code 'ARB'
plot_pop(fn, 'ARB')
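One refinement: calling pd.concat inside the loop copies the accumulated frame on every chunk. Collecting the filtered chunks in a list and concatenating once at the end avoids that. A sketch of the variant (plot_pop_fast is a hypothetical name, and the multiplication is vectorized instead of using zip):
In [ ]:
def plot_pop_fast(filename, country_code):
    """Like plot_pop(), but concatenates all chunks once at the end."""
    chunks = []
    for df_urb_pop in pd.read_csv(filename, chunksize=1000):
        df_c = df_urb_pop[df_urb_pop['CountryCode'] == country_code].copy()
        # Vectorized equivalent of the zip-based column construction
        df_c['Total Urban Population'] = (
            df_c['Total Population'] * df_c['Urban population (% of total)'] * 0.01
        ).astype(int)
        chunks.append(df_c)
    data = pd.concat(chunks, ignore_index=True)
    data.plot(kind='scatter', x='Year', y='Total Urban Population')
    plt.show()

plot_pop_fast(fn, 'CEB')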