Connecting to a PostgreSQL database
When connecting to a PostgreSQL database, many prefer to use the psycopg2 database driver, as it supports practically all of PostgreSQL's features efficiently and is the default driver for the PostgreSQL dialect in SQLAlchemy.
There are four components to the connection string in this exercise: the dialect and driver ('postgresql+psycopg2://'),
followed by the username and password ('student:datacamp'),
followed by the host and port ('@postgresql.csrrinzqubik.us-east-1.rds.amazonaws.com:5432/'),
and finally, the database name ('census').
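Putting those pieces together, here is a minimal sketch of the full connection string passed to create_engine(); it reuses the exercise credentials quoted above, while the rest of this notebook connects to a local database instead.
from sqlalchemy import create_engine

# dialect+driver://username:password@host:port/database
url = 'postgresql+psycopg2://student:datacamp@postgresql.csrrinzqubik.us-east-1.rds.amazonaws.com:5432/census'
engine = create_engine(url)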
# Import create_engine and pandas
from sqlalchemy import create_engine
import pandas as pd

# Create an engine to the census database and open a connection
engine = create_engine('postgresql+psycopg2://postgres:iri@localhost')
connection = engine.connect()

# Create the table & insert data from a pandas DataFrame into PostgreSQL
df = pd.read_csv('all_050_in_36.P1.csv')
# df.to_sql('census', connection)

# Use the .table_names() method on the engine to print the table names
print(engine.table_names())

r = connection.execute('select * from census')
r.fetchall()[:4]
● Provide more complex conditions than simple operators
● E.g. in_(), like(), between() (see the sketch after this list)
● Many more in the documentation
● Available as methods on a Column
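A minimal sketch of like() and between() against the census table (the same table is reflected again in the next cell); the filter patterns and bounds are illustrative only.
from sqlalchemy import create_engine, MetaData, Table, select

engine = create_engine('postgresql+psycopg2://postgres:iri@localhost')
connection = engine.connect()
metadata = MetaData()
census = Table('census', metadata, autoload=True, autoload_with=engine)

# like(): names matching an illustrative pattern
stmt_like = select([census]).where(census.columns.NAME.like('%County%'))

# between(): total population (POP100) between two illustrative bounds
stmt_between = select([census]).where(census.columns.POP100.between(10000, 50000))

print(connection.execute(stmt_like).fetchall()[:3])
print(connection.execute(stmt_between).fetchall()[:3])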
from sqlalchemy import MetaData, Table, select, or_, and_, not_

# Reflect the census table from the database
metadata = MetaData()
census = Table('census', metadata, autoload=True, autoload_with=engine)
print(census.columns.keys())

# Build a select statement and add a where clause on NAME
stmt = select([census])
print(stmt)
stmt = stmt.where(census.columns.NAME.startswith('A'))
print(stmt)

results = connection.execute(stmt).fetchall()
results

# Loop over the results and print the state, county, name, and population columns
for result in results:
    print(result.STATE, result.COUNTY, result.NAME, result["POP100.2000"], result.POP100)
Columns and types of the crime dataset (table13.csv):
State (string)
Agency type (string)
Agency name (string)
Race (signed int64)
Religion (signed int64)
Sexual orientation (signed int64)
Ethnicity (signed int64)
Disability (signed int64)
Gender (signed int64)
Gender Identity (signed int64)
1st quarter (signed int64)
2nd quarter (signed int64)
3rd quarter (signed int64)
4th quarter (signed int64)
Population (string)
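If you prefer not to rely on df.to_sql() to infer these types, here is a minimal sketch of declaring the same schema explicitly with SQLAlchemy Column objects; the table name crime2 matches the one used below, but the exact column types are an assumption based on the listing above.
from sqlalchemy import MetaData, Table, Column, String, BigInteger

metadata = MetaData()
# Explicit declaration mirroring the column listing above (types assumed)
crime2 = Table('crime2', metadata,
               Column('State', String),
               Column('Agency type', String),
               Column('Agency name', String),
               Column('Race', BigInteger),
               Column('Religion', BigInteger),
               Column('Sexual orientation', BigInteger),
               Column('Ethnicity', BigInteger),
               Column('Disability', BigInteger),
               Column('Gender', BigInteger),
               Column('Gender Identity', BigInteger),
               Column('1st quarter', BigInteger),
               Column('2nd quarter', BigInteger),
               Column('3rd quarter', BigInteger),
               Column('4th quarter', BigInteger),
               Column('Population', String))
# metadata.create_all(engine)  # uncomment to create the table in PostgreSQL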
from sqlalchemy import create_engine
import pandas as pd

# Create an engine and connection to the local PostgreSQL database
engine = create_engine('postgresql+psycopg2://postgres:iri@localhost')
connection = engine.connect()

# Read the crime data and convert comma-formatted numbers to numeric
df = pd.read_csv('table13.csv')
df[['Gender','Population']] = df[['Gender','Population']].apply(lambda x: pd.to_numeric(x.str.replace(',',''), errors='coerce'))
# df.to_sql('crime2', connection)

# Use the .table_names() method on the engine to print the table names
print(engine.table_names())
df.head(1)
df.dtypes

# Re-read the raw file to compare dtypes before and after the conversion
df = pd.read_csv('table13.csv')
df.dtypes
pd.to_numeric(df['Population'].str.replace(',','')).head()
df[['Gender','Population']].apply(lambda x: pd.to_numeric(x.str.replace(',',''), errors='coerce')).head()

# Quick look at categorical columns
df['Agency type'].value_counts()
df['State'].unique()
In addition to standard Python comparators, we can also use methods such as in_() to create more powerful where() clauses.
You can see a full list of expressions in the SQLAlchemy Documentation.
http://docs.sqlalchemy.org/en/latest/core/sqlelement.html#module-sqlalchemy.sql.expression
states = ['Connecticut', 'Virginia']

from sqlalchemy import select, or_, and_, not_
from sqlalchemy import create_engine, MetaData, Table

# Create an engine to the database and reflect the crime2 table
engine = create_engine('postgresql+psycopg2://postgres:iri@localhost')
connection = engine.connect()
metadata = MetaData()
crime = Table('crime2', metadata, autoload=True, autoload_with=engine)

# Create a query for the crime table: stmt
# Append a where clause to match all the states in_() the list states
stmt = select([crime]).where(crime.c.State.in_(states))
print(stmt)

# Loop over the ResultProxy and print the first 10 matching rows
c = 0
for result in connection.execute(stmt):
    print(result.State, result.Population, result['Agency type'], result['Agency name'])
    c += 1
    if c == 10:
        break
print(c)
● Allows us to control the order in which records are returned in the query results
● Available as a method on statements: order_by()
Order by Descending
● Wrap the column with desc() to sort that column in descending order
Order by Multiple Columns
● Just separate multiple columns with a comma
● Orders completely by the first column
● Then, if there are duplicates in the first column, orders by the second column
● Repeat until all columns are ordered (see the sketch after this list)
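A minimal sketch of single- and multi-column ordering, assuming the crime table reflected above and the open connection; the chosen columns are just for illustration.
from sqlalchemy import select, desc

# Ascending order by State
stmt = select([crime]).order_by(crime.c.State)

# Descending by Population, then ascending by State to break ties
stmt_multi = select([crime]).order_by(desc(crime.c.Population), crime.c.State)

print(connection.execute(stmt_multi).fetchall()[:5])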
print(crime.columns.keys())
crime.columns['Agency name']

from sqlalchemy import desc

# Combine and_ / or_ conditions, then order by Population (descending) and agency name
statement = select([crime]).where(
    and_(crime.c.State == 'New_York',
         or_(crime.columns['Agency type'] == 'Cities',
             crime.columns.Race == 2,
             crime.c.Gender == 3
             )
         )
    ).order_by(desc(crime.c.Population), crime.c['Agency name'])
print(statement)

connection.execute(statement).fetchall()[:10]
connection.execute('select * from crime2').first()

# Raw SQL: sum the Population column, first as a list of rows, then as a scalar
print(connection.execute('select sum(crime2."Population") from crime2').fetchall())
print(connection.execute('select sum(crime2."Population") from crime2').scalar())
● E.g. count(), sum()
● from sqlalchemy import func
● More efficient than processing the rows in Python
● Used to aggregate data
from sqlalchemy import func

# Build a query to count the distinct State values: stmt
stmt = select([func.count(crime.columns.State.distinct())])
print(stmt)

# Execute the query and store the scalar result: distinct_state_count
distinct_state_count = connection.execute(stmt).scalar()

# Print the distinct_state_count
print(distinct_state_count)
print(connection.execute(stmt).fetchall())

# Sum of the Population column via func.sum()
connection.execute(select([func.sum(crime.c.Population)])).scalar()
● Allows us to group rows by common values.
# Build a query to select the state and count of records by state: stmt
stmt = select([crime.columns.State, func.count(crime.columns.Population)])

# Group stmt by state
stmt = stmt.group_by(crime.columns.State)
print(stmt)

# Execute the statement and store all the records: results
results = connection.execute(stmt).fetchall()
print()

# Print results
print(results)
print()

# Print the keys/column names of the results returned
print(results[0].keys())
# Build an expression to calculate the sum of Population, labeled as 'population'
pop = func.sum(crime.columns.Population).label('population')
print(pop)

# Build a query to select the state and the population sum: stmt
stmt = select([crime.columns.State, pop])

# Group stmt by state
stmt = stmt.group_by(crime.columns.State)
print(stmt)

# Execute the statement and store all the records: results
results = connection.execute(stmt).fetchall()

# Print results
print(results)
print()

# Print the keys/column names of the results returned
print(results[0].keys())
● Supports multiple columns to group by with a pattern similar to order_by() (see the sketch after this list)
● Requires all selected columns to be grouped or aggregated by a function
● SQLAlchemy auto-generates “column names” for functions in the ResultSet
● The column names are often func_# such as count_1
● Replace them with the label() method
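A minimal sketch of grouping by two columns, State and Agency type, with label() applied to the aggregate; it assumes the crime table, func, select, and connection from the cells above, and the column choice is illustrative.
# Group by two columns, following the same comma-separated pattern as order_by()
pop_by_state_type = select([crime.columns.State,
                            crime.columns['Agency type'],
                            func.sum(crime.columns.Population).label('population')])
pop_by_state_type = pop_by_state_type.group_by(crime.columns.State,
                                               crime.columns['Agency type'])
print(connection.execute(pop_by_state_type).fetchall()[:5])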
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Put the grouped results into a DataFrame for plotting
df2 = pd.DataFrame(results, columns=results[0].keys())
df2.head()
print(df2.shape)

# Horizontal bar chart for a slice of the states
df2[11:22].plot.barh(alpha=.6, color='g')

# Top 5 states by population
sns.set_context("notebook", font_scale=.5)
sns.barplot('State', 'population', data=df2.sort_values('population', ascending=False)[:5])

# All states, sorted by population
plt.figure(figsize=(15, 6))
sns.set_context("notebook", font_scale=2)
bar = sns.barplot('State', 'population', data=df2.sort_values('population'))
bar.set_xticklabels(df2.sort_values('population')['State'].values, rotation=90, size=10)
plt.show()