6. 03-regression
Runs the logistic regression.
Step 31 of the methods.
Edit the relevant cells, then run all.
#imports
import numpy as np
import pandas as pd
import geopandas as gpd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from datetime import date
import math
from shapely import ops
import requests
import json
KEY = "" #add personal key here
Functions compilation
def rename_columns(df, prefix):
    #prepend a prefix and strip spaces from every column name
    for col in df.columns:
        new_col = prefix + col.replace(" ", "")
        df = df.rename(columns={col: new_col})
    return df
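For reference, rename_columns strips spaces and prepends the prefix, so a zonal-statistics column such as 'Standard deviation' becomes slopeStandarddeviation. A minimal sketch with made-up column names:
#quick illustration with hypothetical column names
example = pd.DataFrame([[1.0, 2.0]], columns=['Mean', 'Standard deviation'])
print(rename_columns(example, 'slope').columns.tolist())  #['slopeMean', 'slopeStandarddeviation']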
def call_line(coordinates, fields, timestep):
    #perform API call
    url = 'https://api.tomorrow.io/v4/timelines?apikey=' + KEY
    payload = {
        'units': 'metric',
        'timesteps': timestep,
        #'location': point,
        'fields': fields,
        'timezone': 'America/New_York',
        #'startTime': (now - timedelta(days=1)).isoformat(),
        #'endTime': now.isoformat(),
        'location': {
            'type': 'LineString',
            'coordinates': coordinates},
    }
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json"
    }
    response = requests.request("POST", url, json=payload, headers=headers)
    return response
def format_response(response):
    text = json.loads(response.text)
    timelines = text['data']['timelines']
    nest = timelines[0]['intervals'] #list - each element is a dictionary with startTime and values
    startTimes = pd.DataFrame(nest)['startTime']
    val = nest[0]['values'] #first time-stamp value
    values = pd.DataFrame(val, index=[0])
    #iterate through all time-stamps
    for i in range(1, len(startTimes)):
        val = nest[i]['values']
        df = pd.DataFrame(val, index=[i])
        values = pd.concat([values, df], axis=0)
    data = pd.concat([startTimes, values], axis=1)
    return data
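The two helpers are meant to be chained: call_line POSTs a LineString to the Tomorrow.io timelines endpoint and format_response flattens the returned intervals into one row per timestamp. A hedged usage sketch (the coordinates are invented and a valid KEY is required):
#hypothetical two-point line; needs a valid API key in KEY
coords = [[79.217, 30.261], [79.218, 30.256]]
resp = call_line(coords, ['precipitationIntensityAvg'], ['1h'])
hourly = format_response(resp)  #columns: startTime, precipitationIntensityAvg
hourly.head()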
6.1. START HERE
Edit date_list to include all files that have been data-processed, then run all cells.
date_list = ['2021-06-20', '2021-06-21', '2021-06-23', '2021-06-26', '2021-06-27', '2021-06-28']
df = pd.DataFrame(index=range(0, 2))
for i in date_list:
    print(i)
    closures = gpd.read_file('/home/jovyan/work/data/a-closures/closures-agg/' + i + '.gpkg')
    slope = pd.read_csv('/home/jovyan/work/data/c-dem/slope/' + i + '.csv')
    aspect = pd.read_csv('/home/jovyan/work/data/c-dem/aspect/' + i + '.csv')
    elev = pd.read_csv('/home/jovyan/work/data/c-dem/elevation/' + i + '.csv')
    slope = rename_columns(slope, 'slope')
    aspect = rename_columns(aspect, 'aspect')
    elev = rename_columns(elev, 'elev')
    data = closures.merge(slope, left_index=True, right_index=True)
    data = data.merge(aspect, left_index=True, right_index=True)
    data = data.merge(elev, left_index=True, right_index=True)
    #this leaves out all road segments that are within 5 km of the closure, to account for geotagging/chainage errors
    drop_list = []
    for index, row in data.iterrows():
        status = row['status']
        if status == 1:
            before = list(range(int(index - 5), int(index)))
            after = list(range(int(index + 1), int(index + 6)))
            for j in before:
                if j < 0:
                    pass
                elif data.iloc[j, data.columns.get_loc("status")] == 0:
                    drop_list.append(j)
            for j in after:
                if j >= len(data):
                    pass
                elif data.iloc[j, data.columns.get_loc("status")] == 0:
                    drop_list.append(j)
    drop_array = np.unique(np.array(drop_list))
    data = data.drop(drop_array)
    df = df.append(data)
df = df.dropna(how='all')
data = pd.DataFrame(df)
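The loop above stacks the six processed days into one frame, with the buffer around each closure removed; a quick sanity check of what is left (closures appear as status 1):
#sanity check on the assembled training table
print(data.shape)
print(data['status'].value_counts())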
y = np.asarray(data['status'])
#X = data[['precip_1d', 'elevMaximum', 'slopeVariance', 'slopeStandarddeviation', 'aspectMean']]
X = data.drop(['status','date', 'datetime', 'geometry', 'segment_id', 'road_id', 'elevpixelcount', 'aspectpixelcount', 'slopepixelcount', 'elevZoneID', 'aspectZoneID', 'slopeZoneID'], axis=1)
#X = data.drop(['status','date', 'datetime', 'geometry', 'segment_id', 'road_id', 'elevpixelcount', 'aspectpixelcount', 'slopepixelcount', 'elevZoneID', 'aspectZoneID', 'slopeZoneID', 'elevMedian', 'elevMean', 'aspectMean', 'aspectVariance', 'aspectMedianabsolutedeviation', 'slopeMaximum', 'slopeMedian', 'aspectMinimum'], axis=1)
#X = np.asarray(X.loc[:, X.columns != 'geometry'])
logit_model = sm.Logit(y, X)
result = logit_model.fit()
print(result.summary2())
Optimization terminated successfully.
Current function value: 0.083818
Iterations 16
Results: Logit
=============================================================================
Model: Logit Pseudo R-squared: 0.045
Dependent Variable: y AIC: 410.1172
Date: 2021-09-04 00:46 BIC: 535.2733
No. Observations: 2184 Log-Likelihood: -183.06
Df Model: 21 LL-Null: -191.62
Df Residuals: 2162 LLR p-value: 0.70389
Converged: 1.0000 Scale: 1.0000
No. Iterations: 16.0000
-----------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
-----------------------------------------------------------------------------
precip_1d -0.0033 0.0021 -1.5613 0.1185 -0.0074 0.0008
slopeMean 0.2705 0.2670 1.0131 0.3110 -0.2528 0.7938
slopeMedian -0.0795 0.2135 -0.3722 0.7098 -0.4980 0.3391
slopeStandarddeviation -1.4604 0.4822 -3.0287 0.0025 -2.4054 -0.5153
slopeVariance 0.0429 0.0164 2.6134 0.0090 0.0107 0.0751
slopeMinimum -0.1769 0.0702 -2.5220 0.0117 -0.3145 -0.0394
slopeMaximum 0.0049 0.0641 0.0765 0.9390 -0.1208 0.1306
slopeMedianabsolutedeviation 0.4450 0.2838 1.5676 0.1170 -0.1114 1.0013
aspectMean -0.0020 0.0094 -0.2130 0.8313 -0.0204 0.0164
aspectMedian 0.0069 0.0065 1.0509 0.2933 -0.0059 0.0197
aspectStandarddeviation -0.0056 0.0094 -0.6021 0.5471 -0.0240 0.0127
aspectVariance -0.0000 0.0000 -0.0522 0.9584 -0.0000 0.0000
aspectMinimum -0.0002 0.0003 -0.8464 0.3973 -0.0008 0.0003
aspectMaximum -0.0066 0.0044 -1.5072 0.1318 -0.0151 0.0020
aspectMedianabsolutedeviation 0.0055 0.0094 0.5878 0.5567 -0.0129 0.0240
elevMean -0.0012 0.0662 -0.0174 0.9861 -0.1310 0.1286
elevMedian -0.0151 0.0504 -0.2987 0.7652 -0.1139 0.0838
elevStandarddeviation 0.1599 0.0915 1.7483 0.0804 -0.0194 0.3392
elevVariance -0.0007 0.0004 -1.5545 0.1201 -0.0016 0.0002
elevMinimum 0.0195 0.0170 1.1467 0.2515 -0.0138 0.0527
elevMaximum -0.0036 0.0131 -0.2777 0.7812 -0.0294 0.0221
elevMedianabsolutedeviation -0.1011 0.0517 -1.9578 0.0503 -0.2024 0.0001
=============================================================================
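The Logit coefficients are on the log-odds scale; exponentiating them gives odds ratios (e.g. exp(-1.4604) ≈ 0.23 for slopeStandarddeviation). A quick follow-up using the fitted result:
#convert log-odds coefficients to odds ratios
odds_ratios = np.exp(result.params)
print(odds_ratios.sort_values())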
data2 = data[['road_id', 'status', 'segment_id', 'datetime', 'precip_1d', 'slopeMinimum', 'aspectMedian', 'aspectMaximum', 'elevStandarddeviation']]
data2.sample(n=5)
|   | road_id | status | segment_id | datetime | precip_1d | slopeMinimum | aspectMedian | aspectMaximum | elevStandarddeviation |
|---|---------|--------|------------|----------|-----------|--------------|--------------|---------------|-----------------------|
| 194 | 57.0 | 0.0 | 37.0 | 2021-06-21T12:29:59 | 0.0 | 1.469081 | 58.443527 | 360.00000 | 62.872677 |
| 175 | 13.0 | 1.0 | 176.0 | 2021-06-20T12:29:59 | 9.0 | 13.249520 | 268.264280 | 359.69687 | 37.035800 |
| 132 | 57.0 | 0.0 | 56.0 | 2021-06-28T12:29:59 | 0.0 | 7.450240 | 210.172300 | 336.80140 | 64.716500 |
| 122 | 28.0 | 0.0 | 81.0 | 2021-06-21T12:29:59 | 0.0 | 3.062798 | 185.387100 | 359.25595 | 18.401546 |
| 115 | 28.0 | 0.0 | 74.0 | 2021-06-21T12:29:59 | 0.0 | 0.503988 | 207.665800 | 357.87890 | 21.763947 |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
/opt/conda/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
Accuracy of logistic regression classifier on test set: 0.99
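The 0.99 accuracy should be read against the class balance: closures make up only a small share of the segments, so a confusion matrix is a more informative check. A hedged sketch using the already-imported metrics module:
#check class balance and per-class errors, not just overall accuracy
print(pd.Series(y_test).value_counts())
print(metrics.confusion_matrix(y_test, y_pred))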

6.2. DATA
Collects all the data for road 54.
road = gpd.read_file('/home/jovyan/work/data/b-roads/road-split/54.gpkg')
slope = pd.read_csv('/home/jovyan/work/data/c-dem/slope/54.csv')
aspect = pd.read_csv('/home/jovyan/work/data/c-dem/aspect/54.csv')
elev = pd.read_csv('/home/jovyan/work/data/c-dem/elevation/54.csv')
slope = rename_columns(slope, 'slope')
aspect = rename_columns(aspect, 'aspect')
elev = rename_columns(elev, 'elev')
data = road.merge(slope, left_index=True, right_index=True)
data = data.merge(aspect, left_index=True, right_index=True)
data = data.merge(elev, left_index=True, right_index=True)
data = data.drop(['elevpixelcount', 'aspectpixelcount', 'slopepixelcount', 'elevZoneID', 'aspectZoneID', 'slopeZoneID'], axis=1)
#add precipitation data for segments 1-10 on road 54 in the next 24 hours. Only adding 10 segments' data at a time due to API call limit.
for index, row in data.head(10).iterrows():
    geom = row['geometry']
    geom_list = list(ops.linemerge(geom).coords)
    res = [list(ele) for ele in geom_list]
    response = call_line(res, ['precipitationIntensityAvg'], ['1h'])
    table = format_response(response)
    data.at[index, 'precip_1d'] = table.head(23).sum()['precipitationIntensityAvg']
data2 = data.drop(['road_id', 'date', 'status', 'segment_id', 'geometry', 'aspectMean'], axis=1)
data2 = data2.head(10)
#run prediction on first 10 line segments
y_pred = logreg.predict(data2)
data3 = data.head(10)
data3['probability'] = y_pred
data3[['road_id', 'date', 'segment_id', 'geometry', 'precip_1d', 'probability']]
/opt/conda/lib/python3.8/site-packages/geopandas/geodataframe.py:853: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
super(GeoDataFrame, self).__setitem__(key, value)
|   | road_id | date | segment_id | geometry | precip_1d | probability |
|---|---------|------|------------|----------|-----------|-------------|
| 0 | 54 | 2021-07-12 | 1 | MULTILINESTRING ((79.21719 30.26079, 79.21749 ... | 8.1075 | 0.0 |
| 1 | 54 | 2021-07-12 | 2 | MULTILINESTRING ((79.21827 30.25553, 79.21801 ... | 8.1075 | 0.0 |
| 2 | 54 | 2021-07-12 | 3 | MULTILINESTRING ((79.21520 30.25862, 79.21484 ... | 8.1075 | 0.0 |
| 3 | 54 | 2021-07-12 | 4 | MULTILINESTRING ((79.21620 30.25347, 79.21600 ... | 18.5075 | 0.0 |
| 4 | 54 | 2021-07-12 | 5 | MULTILINESTRING ((79.21417 30.24601, 79.21422 ... | 28.9077 | 0.0 |
| 5 | 54 | 2021-07-12 | 6 | MULTILINESTRING ((79.21905 30.24550, 79.21929 ... | 28.9077 | 0.0 |
| 6 | 54 | 2021-07-12 | 7 | MULTILINESTRING ((79.21855 30.24397, 79.21830 ... | 28.9077 | 0.0 |
| 7 | 54 | 2021-07-12 | 8 | MULTILINESTRING ((79.22114 30.24374, 79.22138 ... | 28.9077 | 0.0 |
| 8 | 54 | 2021-07-12 | 9 | MULTILINESTRING ((79.22994 30.24412, 79.22943 ... | 28.9077 | 0.0 |
| 9 | 54 | 2021-07-12 | 10 | MULTILINESTRING ((79.22076 30.24131, 79.22040 ... | 28.9077 | 0.0 |
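Note that logreg.predict returns hard class labels, which is why the probability column above is all 0.0. If a per-segment closure probability is wanted, predict_proba could be used instead; a small sketch:
#probability of the positive (closure) class for the same 10 segments
data3['probability'] = logreg.predict_proba(data2)[:, 1]
data3[['road_id', 'date', 'segment_id', 'precip_1d', 'probability']]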