Nationa Mortgage Database Project¶

Python: Data Extraction¶

  • Data Source: Nationa Mortgage Database
  • Dataset: Metro
  • Export Option: SQL database
  • Data Transformations:
  • Future changes:
In [1]:
# imports
import pandas as pd
import os
from datetime import date, timedelta
In [2]:
# Setup
os.chdir('/home/jovyan/work')

uri = 'https://www.fhfa.gov/DataTools/Downloads/Documents/NATIONAL-MORTGAGE-DATABASE-(NMDB)-AGGREGATE-DATA/nmdb-mortgage-performance-statistics-metro.csv'
In [3]:
# check if last updated is 24 hrs old
def is24hr_old(filename):
    epoch = os.path.getmtime(f'data/{filename}.csv')
    org = date.fromtimestamp(epoch)
    is_old = org + timedelta(days=1) < date.today()
    return is_old

# Cache dataset to local csv and refresh if older than 1 day
def refresh_data(filename):
    if f'{filename}.csv' not in os.listdir('data') or is24hr_old(filename):
        df = pd.read_csv(uri)
        df.to_csv(f'data/{filename}.csv')    
    else:
        df = pd.read_csv(f'data/{filename}.csv')
        
    return df
In [4]:
df = refresh_data('national_mortgage')
df.columns
Out[4]:
Index(['SOURCE', 'FREQUENCY', 'SERIES', 'SERIESID', 'GEOLEVEL', 'GEONAME',
       'STATEPOSTAL', 'STATEFIPS', 'METRO', 'MARKET', 'YEAR', 'QUARTER',
       'SUPPRESSED', 'VALUE'],
      dtype='object')
In [45]:
# filter for Seattle Wa Metropolitan Area
df = df[df['GEONAME'].str.contains('Seattle')]

df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1012 entries, 6723 to 99903
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SOURCE       1012 non-null   object 
 1   FREQUENCY    1012 non-null   object 
 2   SERIES       1012 non-null   object 
 3   SERIESID     1012 non-null   object 
 4   GEOLEVEL     1012 non-null   object 
 5   GEONAME      1012 non-null   object 
 6   STATEPOSTAL  0 non-null      float64
 7   STATEFIPS    0 non-null      float64
 8   METRO        1012 non-null   int64  
 9   MARKET       1012 non-null   object 
 10  YEAR         1012 non-null   int64  
 11  QUARTER      1012 non-null   int64  
 12  SUPPRESSED   1012 non-null   int64  
 13  VALUE        1012 non-null   float64
dtypes: float64(3), int64(4), object(7)
memory usage: 118.6+ KB
In [46]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://postgres:headband@192.168.0.151:5432/mydb')
In [48]:
def update_server(table_name):
    try:
        df.to_sql(table_name,con=engine,index=False,if_exists='replace')
        print('Successfully updated server')
    except:
        print('Got an error while updating server')
In [49]:
update_server('seattle_mortgage')
Successfully updated server