AIS Trade Estimation#

0.1 Load libraries

%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
import os
from os.path import join
from glob import glob
import pandas as pd
import geopandas as gpd
import folium
from shapely.geometry import Point
import folium.plugins as plugins
import seaborn as sns
from matplotlib import pyplot as plt

import numpy as np
import datetime
from datetime import timedelta
from port_call import create_port_calls
import boto3

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics 
pd.options.display.max_columns = None

0.2 Load AIS data

AIS data has been accessed through the UN Big Data Platform.

We have retrieved all AIS messages that intersect a 20 km. buffer from each port in Syria (Al Ladhiqiyah, Tartus, Baniyas), available from December 1st 2018 to August 31st 2022.

ais_dir = join(os.path.expanduser("~"), 'data', 'AIS')
data_dir = join(ais_dir, 'Syria')
data_files = glob(data_dir+"/*.csv")
dfs = [pd.read_csv(f, index_col=0) for f in data_files]
df = pd.concat(dfs)
df.polygon_name.unique()
array(['AL LADHIQIYAH', 'TARTUS', 'BANIYAS'], dtype=object)
df_latakia = df.loc[df.polygon_name=="AL LADHIQIYAH"].copy()
df_tartus = df.loc[df.polygon_name=="TARTUS"].copy()
df_baniyas = df.loc[df.polygon_name=="BANIYAS"].copy()
start = '2018-12-01'
end = '2022-08-31'
country = "Syria"

1.2 Data Preparation#

Run port call algorithm that converts AIS messages into trips.

The port call algorithm sorts AIS data by MMSI (unique identified per ship) and date/time of AIS message. It then captures days with consecutive AIS messages and groups them into a single trip, calculating new attributes for each trip: turnaround-time (total time spent at port between arrival and departure), difference in reported draft, and difference in direction of travel.

trips = create_port_calls(df_latakia, start, end, "Latakia", "Syria")
len(trips)
793

Filter out trips that are not related to trade

  • Only cargo and tanker vessels

  • Turnaround time < 5 hours or > 95th percentile (refueling or maintenance)

  • Turnaround time < 10 hours and direction within 45 degrees (passing by)

  • Vessel types or subtypes that do not relate to trade

trips.loc[trips.turn_around_time < 5, "passing2"] = "refueling"
trips.loc[trips.turn_around_time > np.percentile(trips.turn_around_time, 95), "passing2"] = "maintenance"
trips.loc[(trips.turn_around_time < 10) & (trips['heading-diff'].abs()<45), "passing2"] = "passing by"
trips.passing2.value_counts()
passing by     98
maintenance    40
refueling      20
Name: passing2, dtype: int64
data = trips.loc[trips.passing2.isna()].copy()
len(data)
635
data.vessel_type.unique()
array(['Cargo', 'UNAVAILABLE', 'Passenger', 'Unknown', 'Reserved'],
      dtype=object)
data.vessel_type_main.unique()
array(['Container Ship', 'General Cargo Ship', nan,
       'Oil And Chemical Tanker', 'Bulk Carrier', 'Ro Ro Cargo Ship',
       'Specialized Cargo Ship', 'Offshore Vessel'], dtype=object)
data.loc[data.vessel_type=="Reserved", "vessel_type_main"].unique()
array([nan], dtype=object)
data.loc[data.vessel_type=="Passenger", "vessel_type_main"].unique()
array(['Ro Ro Cargo Ship', nan], dtype=object)
drop_types = ["Reserved", "Passenger"]
data = data.loc[~(data.vessel_type.isin(drop_types))].copy()
len(data)
627
data.vessel_type_sub.unique()
array([nan, 'Oil Products Tanker', 'Vehicles Carrier',
       'Livestock Carrier', 'Offshore Tug Supply Ship'], dtype=object)
drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
data = data.loc[~(data.vessel_type_sub.isin(drop_sub_types))].copy()
len(data)
625

1.3 Predict DWT#

Match vessel information from Fleetmon database, and impute missing values

data.loc[:, "mmsi"] = data.loc[:, "mmsi"].astype('int')
vessels = pd.read_csv(join(ais_dir, "vessel_database_full.csv"))
# vessels = pd.read_excel(join(ais_dir, "Pacific_vessel_database_2021_new.xlsx"), 0)
vessels.head(2)
mmsi dwt length_design width_design draught_design block vessel_type_AIS sub_type sub_vessel_type_AIS
0 667001408 82.0 34.5 11.008 2.1 0.67 Cargo Container Ship Container
1 351863000 90.0 30.5 9.919 3.5 0.67 Cargo Container Ship Container
vessels.rename(columns={
    'length_design':'length',
    'width_design':'width',
    'vessel_type_AIS':'vessel_type',
    'sub_vessel_type_AIS':'sub_vessel_type'
}, inplace=True)
vessels = vessels.loc[~(vessels.mmsi.isna())].copy()
vessels.loc[:, "mmsi"] = vessels.loc[:, "mmsi"].astype('int')
vessels_filt = vessels[['sub_type', 'vessel_type', 'mmsi', 'dwt', 'length', 'width', 'draught_design']].copy()
# vessels_filt = vessels[['sub_vessel_type', 'vessel_type', 'mmsi', 'dwt', 'gt', 'length', 'width', 'Draft', 'flag']].copy()
vessels_filt.loc[:, 'vessel_info'] = 1
data_join = data.merge(vessels_filt, on='mmsi', how='left', suffixes=['_ais', '_vessel'])
data_join.loc[data_join.vessel_info.isna(), "vessel_info"] = 0
data_join.vessel_info.value_counts()
1.0    619
0.0      6
Name: vessel_info, dtype: int64

Prepare attributes for later use

block_coefficients = {
    'bulk':0.79,
    'container':0.73,
    'tanker':0.83,
    'LNG':0.79
}
# groups = {
#     'Dry Bulk':['Bulk carrier'],
#     'Container':['Container ship'],
#     'General Cargo':['General cargo vessel', 'Cargo ship'],
#     'Reefer':['Reefer'],
#     'Vehicle carrier':['Vehicle carrier'],
#     'Ro Ro Cargo Ship':['RoRo ship'],
#     'Animal products':['livestock carrier'],
#     'Forest':['Forest-producer carrier']
# }
# data_join.to_csv(join(ais_dir, "port_calls_latakia.csv"))
# data_join = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)
data_join.loc[:, "draught_delta"] = 1
data_join.loc[data_join['draught-diff']==0, "draught_delta"] = 0
data_join.draught_delta.value_counts()/len(data_join)
0    0.6192
1    0.3808
Name: draught_delta, dtype: float64

Only 38% have some draught delta

data_join.loc[:, 'draft_ais_max'] = data_join[['draught-in', 'draught-out']].max(axis=1)
train = data_join[(data_join.vessel_info==1) & (~data_join.dwt.isna())].copy()
df_missing = data_join[data_join.dwt.isna()].copy()
len(train)+len(df_missing)== len(data_join)
True
train.length_vessel.isna().sum(), train.width_vessel.isna().sum(), train.draught_design.isna().sum(), train.dwt.isna().sum()
(0, 0, 0, 0)
train.dwt.astype('int')
0      13623
1      20624
2      17728
3      35181
4      13623
       ...  
620    31760
621    31760
622    35220
623    31760
624     9389
Name: dwt, Length: 619, dtype: int64
y = list(train.dwt.astype('int'))
X = [[row.length_vessel, row.width_vessel, row.draught_design] for idx, row in train.iterrows()]
len(y)==len(X)
True
# Spliting arrays or matrices into random train and test subsets
# i.e. 70 % training dataset and 25 % test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# creating a RF classifier
clf = RandomForestClassifier(n_estimators = 500, random_state=1) 
 
# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)
 
# performing predictions on the test dataset
y_pred = clf.predict(X_test)

# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
print("R-Squared OF THE MODEL: ", metrics.r2_score(y_test, y_pred))
ACCURACY OF THE MODEL:  0.7806451612903226
R-Squared OF THE MODEL:  0.9944090737438522
X_missing = [[row.length_ais, row.width_ais, row.draft_ais_max] for idx, row in df_missing.iterrows()]
pred_with_missing = clf.predict(X_missing)
df_missing.dwt = pred_with_missing
data_join.loc[data_join.dwt.isna(), 'dwt'] = df_missing.dwt
df = data_join.copy()

Add block coefficient category

df.columns
Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
       'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
       'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
       'heading-diff', 'passing', 'port-name', 'country', 'passing2',
       'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
       'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
       'draft_ais_max'],
      dtype='object')
df.loc[:, 'block_cat'] = ""
df.loc[:, 'vessel_type_best'] = df.sub_type #df.sub_vessel_type
df.vessel_type_best.isna().sum()
177
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_vessel'] # vessel_type_vessel
df.vessel_type_best.isna().sum()
6
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_sub']
df.vessel_type_best.isna().sum()
6
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_main']
df.vessel_type_best.isna().sum()
5
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_ais']
df.vessel_type_best.isna().sum()
0
df.loc[df.vessel_type_best.str.lower().str.contains('container'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('bulk'), "block_cat"] = 'bulk'
df.loc[df.vessel_type_best.str.lower().str.contains('tanker'), "block_cat"] = 'tanker'
df.loc[df.vessel_type_best.str.lower().str.contains('lng'), "block_cat"] = 'LNG'
df.loc[df.block_cat=='', "vessel_type_best"].unique()
array(['General Cargo Ship', 'Cargo', 'Vehicles Carrier',
       'Livestock Carrier'], dtype=object)
df.loc[df.vessel_type_best.str.lower().str.contains('cargo'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('roro'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('vehicle'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('forest'), "block_cat"] = 'bulk'
df.loc[df.vessel_type_best.str.lower().str.contains('livestock'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('reefer'), "block_cat"] = 'container'
df.block_cat.unique()
array(['container', 'tanker', 'bulk'], dtype=object)

Get draft differences from next

aws_bucket = "wbgdecinternal-ntl"
path = "Andres_Temp/AIS"
client = boto3.client('s3')
file_list = client.list_objects_v2(Bucket=aws_bucket, Prefix=path, MaxKeys=5000)
bucket_files = [os.path.join("s3://", aws_bucket, content['Key']) for content in file_list['Contents']]
# bucket_files
df_next = pd.read_csv('s3://wbgdecinternal-ntl/Andres_Temp/AIS/Latakia_next_draught.csv', index_col=0)
len(df_next)
308
df_orig = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)
df_next = df_next.join(df_orig[['turn_around_time', 'date-leave']])
df = df.merge(df_next[['mmsi', 'draught', 'heading', 'date-leave', 'turn_around_time']], on=['mmsi', 'date-leave', 'turn_around_time'], how='left').copy()
df.loc[(df.draught_delta==0) & (~df.draught.isna()), "draught-out"] = df.loc[(df.draught_delta==0) & (~df.draught.isna()), "draught"]
df.loc[(df.draught_delta==0) & (~df.draught.isna()), "heading-out"] = df.loc[(df.draught_delta==0) & (~df.draught.isna()), "heading"]
df['draught-diff'] = df['draught-out'] - df['draught-in']
df.loc[df['draught-diff']!=0, "draught_delta"] = 1
df.draught_delta.value_counts()
1    482
0    143
Name: draught_delta, dtype: int64

Calculate payload

df.loc[df['draught-in']==0, 'draught_delta'] = 0
df.loc[df['draught-out']==0, 'draught_delta'] = 0
df.loc[df['draught-in']==0, 'draught-diff'] = 0
df.loc[df['draught-out']==0, 'draught-diff'] = 0
df.loc[:, 'draft_max'] = df[['draught_design', 'draught-in', 'draught-out']].max(axis=1)
df.columns
Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
       'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
       'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
       'heading-diff', 'passing', 'port-name', 'country', 'passing2',
       'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
       'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
       'draft_ais_max', 'block_cat', 'vessel_type_best', 'draught', 'heading',
       'draft_max', 'length', 'width', 'payload_in', 'payload_out'],
      dtype='object')
df.loc[:, 'length'] = df.loc[:, 'length_vessel']
df.loc[df.length.isna(), 'length'] = df.loc[df.length.isna(), 'length_ais']
df.loc[:, 'width'] = df.loc[:, 'width_vessel']
df.loc[df.width.isna(), 'width'] = df.loc[df.width.isna(), 'width_ais']
def calculate_payload(row, direction):
    
    # get parameters
    Din = row['draught-in']
    Dout = row['draught-out']
    if Din==Dout:
        return pd.NA
    else:
        L = row['length']
        W = row['width']
        Dd = row['draft_max']
        DWT = row['dwt']
        Cb = block_coefficients[row['block_cat']]
        pw = 1.025 # tons/m3  1029kg/m3

        if direction=="in":
            Dr = Din
        if direction =='out':
            Dr = Dout

        # calcualte block coefficient for reported draft
        Cbr = 1 - ((1 - Cb)*((Dr / Dd)**(1/3)))

        # calcualte payload rate
        payload = ((((Cbr*Dr) - (Cb*Dd)) * (L*W*pw)) + DWT ) / DWT
        return payload
df.loc[:, "payload_in"] = df.apply(lambda x: calculate_payload(x, 'in'), axis=1)
df.loc[:, "payload_out"] = df.apply(lambda x: calculate_payload(x, 'out'), axis=1)
df.head(2)
date-leave mmsi turn_around_time datetime-leave draught-out heading-out seconds datetime-entry date-entry vessel_type_ais dtg length_ais width_ais draught-in heading-in vessel_type_main vessel_type_sub draught-diff heading-diff passing port-name country passing2 sub_type vessel_type_vessel dwt length_vessel width_vessel draught_design vessel_info draught_delta draft_ais_max block_cat vessel_type_best draught heading draft_max length width payload_in payload_out
0 2019-01-07 271044398 31.154722 2019-01-07 15:06:36 6.4 0.0 112157.0 2019-01-06 07:57:19 2019-01-06 Cargo 2019-01-06 07:39:16 152.0 24.0 8.9 0.0 Container Ship NaN -2.5 0.0 3 Latakia Syria NaN Container Ship Cargo 13623.000000 151.0 24.0 8.25 1.0 1 8.9 container Container Ship 6.4 0.0 8.9 151.0 24.0 1.0 0.551421
1 2019-01-10 271040403 10.353056 2019-01-10 19:54:11 9.3 48.0 37271.0 2019-01-10 09:33:00 2019-01-10 Cargo 2019-01-10 09:33:00 184.0 25.0 9.3 48.0 Container Ship NaN 0.0 0.0 3 Latakia Syria NaN Container Ship Cargo 20624.133195 184.0 25.0 9.40 1.0 0 9.3 container Container Ship NaN NaN 9.4 184.0 25.0 <NA> <NA>
# trade_in = (df.payload_in*df.dwt).sum()
# trade_out = (df.payload_out*df.dwt).sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
df.loc[:, "export_val"] = 0
df.loc[:, "import_val"] = 0
for idx, row in df.iterrows():
    delta = row['draught-diff']
    if delta < 0:
        import_val = (row.payload_in*row.dwt) - (row.payload_out*row.dwt)
        export_val = 0
    elif delta > 0:
        import_val = 0
        export_val = (row.payload_out*row.dwt) - (row.payload_in*row.dwt)
    df.loc[idx, 'import_val'] = import_val
    df.loc[idx, 'export_val'] = export_val
df_sub = df.loc[df.draught_delta==1].copy()
avg_flow = df_sub[['mmsi', 'import_val', 'export_val']].groupby('mmsi').mean()
avg_flow.loc[avg_flow.import_val>avg_flow.export_val, "import_avg"] = avg_flow.import_val
avg_flow.loc[avg_flow.import_val<avg_flow.export_val, "export_avg"] = avg_flow.export_val
avg_flow.fillna(0, inplace=True)
# avg_flow.reset_index(inplace=True)
avg_flow.drop(['import_val', 'export_val'], axis=1, inplace=True)
# trade_in = df.loc[df['draught-diff']<0, "import_val"].sum()
# trade_out = df.loc[df['draught-diff']>0, "export_val"].sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
for idx, row in df.iterrows():
    delta = row['draught-diff']
    if delta == 0:
        try:
            avg = avg_flow.loc[row.mmsi]
            df.loc[idx, 'import_val'] = avg.import_avg # import_val
            df.loc[idx, 'export_val'] = avg.export_avg # export_val
        except:
            df.loc[idx, 'import_val'] = 0
            df.loc[idx, 'export_val'] = 0
#         import_val = fr_im*row.dwt
#         export_val = fr_exp*row.dwt
len(df.loc[df.draught_delta==0])
149
len(df.loc[(df.draught_delta==0) & (df.import_val>0)])
90
len(df.loc[(df.draught_delta==0) & (df.export_val>0)])
6
df_latakia_trade = df.copy()
# df_final = df_latakia_trade.copy()
# df_final['date-entry']
# df_final.loc[:, "ym"] = 
# df_final[['port-name', 'date-entry', 'export_val', 'import_val']].groupby(['port-name', 'date-entry']).sum().to_csv(join(ais_dir, "preliminary_results_latakia_11.22.v3.csv"))

Run Tartus#

1.2 Data Prep#

Run port call algorithm that converts AIS messages into trips

trips = create_port_calls(df_tartus, start, end, "Tartus", "Syria")
len(trips)
307

Filter out non-trade trips

trips.loc[trips.turn_around_time < 5, "passing2"] = "refueling"
trips.loc[trips.turn_around_time > np.percentile(trips.turn_around_time, 95), "passing2"] = "maintenance"
trips.loc[(trips.turn_around_time < 10) & (trips['heading-diff'].abs()<45), "passing2"] = "passing by"
trips.passing2.value_counts()
passing by     86
maintenance    16
refueling      11
Name: passing2, dtype: int64
data = trips.loc[trips.passing2.isna()].copy()
len(data)
194
data.vessel_type.unique()
array(['Cargo', 'Tanker', 'Other', 'Passenger', 'SAR', 'Tug',
       'UNAVAILABLE', 'Unknown', 'Towing'], dtype=object)
data.vessel_type_main.unique()
array(['Container Ship', nan, 'General Cargo Ship', 'Bulk Carrier',
       'Other Tanker', 'Specialized Cargo Ship', 'Ro Ro Cargo Ship',
       'Oil And Chemical Tanker'], dtype=object)
drop_types = ["Reserved", "Passenger", "Passenger", "Tug", "Towing", "SAR"]
data = data.loc[~(data.vessel_type.isin(drop_types))].copy()
len(data)
185
data.vessel_type_sub.unique()
array([nan, 'Vegetable Oil Tanker', 'Livestock Carrier',
       'Chemical Oil Products Tanker', 'Chemical Tanker'], dtype=object)
# drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
# data = data.loc[~(data.vessel_type_sub.isin(drop_sub_types))].copy()
# data.loc[664]

1.3 Predict DWT#

Match vessel information from Fleetmon database, and impute missing values

data.loc[:, "mmsi"] = data.loc[:, "mmsi"].astype('int')
vessels = pd.read_csv(join(ais_dir, "vessel_database_full.csv"))
# vessels = pd.read_excel(join(ais_dir, "Pacific_vessel_database_2021_new.xlsx"), 0)
vessels.head(2)
mmsi dwt length_design width_design draught_design block vessel_type_AIS sub_type sub_vessel_type_AIS
0 667001408 82.0 34.5 11.008 2.1 0.67 Cargo Container Ship Container
1 351863000 90.0 30.5 9.919 3.5 0.67 Cargo Container Ship Container
vessels.rename(columns={
    'length_design':'length',
    'width_design':'width',
    'vessel_type_AIS':'vessel_type',
    'sub_vessel_type_AIS':'sub_vessel_type'
}, inplace=True)
vessels = vessels.loc[~(vessels.mmsi.isna())].copy()
vessels.loc[:, "mmsi"] = vessels.loc[:, "mmsi"].astype('int')
vessels_filt = vessels[['sub_type', 'vessel_type', 'mmsi', 'dwt', 'length', 'width', 'draught_design']].copy()
vessels_filt.loc[:, 'vessel_info'] = 1
data_join = data.merge(vessels_filt, on='mmsi', how='left', suffixes=['_ais', '_vessel'])
data_join.loc[data_join.vessel_info.isna(), "vessel_info"] = 0
data_join.vessel_info.value_counts()
1.0    157
0.0     28
Name: vessel_info, dtype: int64
data_join.vessel_type_vessel.unique()
array(['Cargo', 'Tanker', nan], dtype=object)

For later use

block_coefficients = {
    'bulk':0.79,
    'container':0.73,
    'tanker':0.83,
    'LNG':0.79
}
groups = {
    'Dry Bulk':['Bulk carrier'],
    'Container':['Container ship'],
    'General Cargo':['General cargo vessel', 'Cargo ship'],
    'Reefer':['Reefer'],
    'Vehicle carrier':['Vehicle carrier'],
    'Ro Ro Cargo Ship':['RoRo ship'],
    'Animal products':['livestock carrier'],
    'Forest':['Forest-producer carrier']
}
# drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
# data_join = data_join.loc[~(data_join.vessel_type_sub.isin(drop_sub_types))].copy()
# data_join.to_csv(join(ais_dir, "port_calls_latakia.csv"))
# data_join = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)
data_join.loc[:, "draught_delta"] = 1
data_join.loc[data_join['draught-diff']==0, "draught_delta"] = 0
data_join.draught_delta.value_counts()/len(data_join)
0    0.848649
1    0.151351
Name: draught_delta, dtype: float64
data_join.draught_delta.value_counts()
0    157
1     28
Name: draught_delta, dtype: int64
data_join.loc[:, 'draft_ais_max'] = data_join[['draught-in', 'draught-out']].max(axis=1)
# data_join[data_join.dwt.isna()].iloc[0]
# data_join[data_join.draft_ais_max==0]
train = data_join[(data_join.vessel_info==1) & (~data_join.dwt.isna())].copy()
df_missing = data_join[data_join.dwt.isna()].copy()
len(train)+len(df_missing)== len(data_join)
True
train.length_vessel.isna().sum(), train.width_vessel.isna().sum(), train.draught_design.isna().sum(), train.dwt.isna().sum()
(0, 0, 0, 0)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics 
y = list(train.dwt.astype('int'))
X = [[row.length_vessel, row.width_vessel, row.draught_design] for idx, row in train.iterrows()]
len(y)==len(X)
True
# Spliting arrays or matrices into random train and test subsets
# i.e. 70 % training dataset and 25 % test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# creating a RF classifier
clf = RandomForestClassifier(n_estimators = 500, random_state=1) 
 
# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)
 
# performing predictions on the test dataset
y_pred = clf.predict(X_test)

# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
print("R-Squared OF THE MODEL: ", metrics.r2_score(y_test, y_pred))
ACCURACY OF THE MODEL:  0.7
R-Squared OF THE MODEL:  0.9772705948715836
# df_missing.loc[df_missing.length_ais==0]
X_missing = [[row.length_ais, row.width_ais, row.draft_ais_max] for idx, row in df_missing.iterrows()]
pred_with_missing = clf.predict(X_missing)
df_missing.dwt = pred_with_missing
data_join.loc[data_join.dwt.isna(), 'dwt'] = df_missing.dwt
# data_join = data_join.loc[data_join.draught_delta==1].copy()
# data_join = data_join.loc[data_join.vessel_info==1].copy()
# data_join['draught-diff'].describe()
df = data_join.copy()

Add block coefficient category

df.columns
Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
       'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
       'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
       'heading-diff', 'passing', 'port-name', 'country', 'passing2',
       'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
       'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
       'draft_ais_max'],
      dtype='object')
df.loc[:, 'block_cat'] = ""
df.loc[:, 'vessel_type_best'] = df.sub_type
df.vessel_type_best.isna().sum()
80
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_vessel']
df.vessel_type_best.isna().sum()
28
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_sub']
df.vessel_type_best.isna().sum()
28
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_main']
df.vessel_type_best.isna().sum()
28
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_ais']
df.vessel_type_best.isna().sum()
0
df.loc[df.vessel_type_best.str.lower().str.contains('container'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('bulk'), "block_cat"] = 'bulk'
df.loc[df.vessel_type_best.str.lower().str.contains('tanker'), "block_cat"] = 'tanker'
df.loc[df.vessel_type_best.str.lower().str.contains('lng'), "block_cat"] = 'LNG'
df.loc[df.block_cat=='', "vessel_type_best"].unique()
array([], dtype=object)
# df.loc[df.block_cat=='bulk', "sub_vessel_type"].unique()
# df.loc[df.block_cat=='container', "sub_vessel_type"].unique()
# df.loc[df.vessel_type_best=='Other', ]
df.loc[df.vessel_type_best.str.lower().str.contains('cargo'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('roro'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('vehicle'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('forest'), "block_cat"] = 'bulk'
df.loc[df.vessel_type_best.str.lower().str.contains('livestock'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('reefer'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('other'), "block_cat"] = 'container'
df.block_cat.unique()
array(['container', 'tanker', 'bulk'], dtype=object)

Get draft differences from next

aws_bucket = "wbgdecinternal-ntl"
path = "Andres_Temp/AIS"
client = boto3.client('s3')
file_list = client.list_objects_v2(Bucket=aws_bucket, Prefix=path, MaxKeys=5000)
bucket_files = [os.path.join("s3://", aws_bucket, content['Key']) for content in file_list['Contents']]
# bucket_files
df_next = pd.read_csv('s3://wbgdecinternal-ntl/Andres_Temp/AIS/Tartus_next_draught.csv', index_col=0)
len(df_next)
67
df_orig = pd.read_csv(join(ais_dir, "port_calls_tartus.csv"), index_col=0)
df_orig[['turn_around_time', 'date-leave']].head(2)
turn_around_time date-leave
0 21.360278 2018-12-20
1 10.200833 2019-01-15
df_next = df_next.join(df_orig[['turn_around_time', 'date-leave']])
df = df.merge(df_next[['mmsi', 'draught', 'heading', 'date-leave', 'turn_around_time']], on=['mmsi', 'date-leave', 'turn_around_time'], how='left').copy()
df.loc[(df.draught_delta==0) & (~df.draught.isna()), "draught-out"] = df.loc[(df.draught_delta==0) & (~df.draught.isna()), "draught"]
df.loc[(df.draught_delta==0) & (~df.draught.isna()), "heading-out"] = df.loc[(df.draught_delta==0) & (~df.draught.isna()), "heading"]
df['draught-diff'] = df['draught-out'] - df['draught-in']
df.loc[df['draught-diff']!=0, "draught_delta"] = 1
df.draught_delta.value_counts()
0    111
1     74
Name: draught_delta, dtype: int64
df.loc[df['draught-in']==0, 'draught_delta'] = 0
df.loc[df['draught-out']==0, 'draught_delta'] = 0
df.loc[df['draught-in']==0, 'draught-diff'] = 0
df.loc[df['draught-out']==0, 'draught-diff'] = 0
df.loc[:, 'draft_max'] = df[['draught_design', 'draught-in', 'draught-out']].max(axis=1)
df.loc[:, 'length'] = df.loc[:, 'length_vessel']
df.loc[df.length.isna(), 'length'] = df.loc[df.length.isna(), 'length_ais']
df.loc[:, 'width'] = df.loc[:, 'width_vessel']
df.loc[df.width.isna(), 'width'] = df.loc[df.width.isna(), 'width_ais']
def calculate_payload(row, direction):
    
    # get parameters
    Din = row['draught-in']
    Dout = row['draught-out']
    if Din==Dout:
        return pd.NA
    else:
        L = row['length']
        W = row['width']
        Dd = row['draft_max']
        DWT = row['dwt']
        Cb = block_coefficients[row['block_cat']]
        pw = 1.025 # tons/m3  1029kg/m3

        if direction=="in":
            Dr = Din
        if direction =='out':
            Dr = Dout

        # calcualte block coefficient for reported draft
        Cbr = 1 - ((1 - Cb)*((Dr / Dd)**(1/3)))

        # calcualte payload rate
        payload = ((((Cbr*Dr) - (Cb*Dd)) * (L*W*pw)) + DWT ) / DWT
        return payload
df.loc[:, "payload_in"] = df.apply(lambda x: calculate_payload(x, 'in'), axis=1)
df.loc[:, "payload_out"] = df.apply(lambda x: calculate_payload(x, 'out'), axis=1)
# df.loc[:, ['payload_in', 'payload_out']]
df.loc[:, "export_val"] = 0
df.loc[:, "import_val"] = 0
for idx, row in df.iterrows():
    delta = row['draught-diff']
    if delta < 0:
        import_val = (row.payload_in*row.dwt) - (row.payload_out*row.dwt)
        export_val = 0
    elif delta > 0:
        import_val = 0
        export_val = (row.payload_out*row.dwt) - (row.payload_in*row.dwt)
    df.loc[idx, 'import_val'] = import_val
    df.loc[idx, 'export_val'] = export_val
# trade_in = df.loc[df['draught-diff']<0, "import_val"].sum()
# trade_out = df.loc[df['draught-diff']>0, "export_val"].sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
df_sub = df.loc[df.draught_delta==1].copy()
avg_flow = df_sub[['mmsi', 'import_val', 'export_val']].groupby('mmsi').mean()
avg_flow.loc[avg_flow.import_val>avg_flow.export_val, "import_avg"] = avg_flow.import_val
avg_flow.loc[avg_flow.import_val<avg_flow.export_val, "export_avg"] = avg_flow.export_val
avg_flow.fillna(0, inplace=True)
# avg_flow.reset_index(inplace=True)
avg_flow.drop(['import_val', 'export_val'], axis=1, inplace=True)
len(avg_flow)
26
for idx, row in df.iterrows():
    delta = row['draught-diff']
    if delta == 0:
        try:
            avg = avg_flow.loc[row.mmsi]
            df.loc[idx, 'import_val'] = avg.import_avg # import_val
            df.loc[idx, 'export_val'] = avg.export_avg # export_val
        except:
            df.loc[idx, 'import_val'] = 0
            df.loc[idx, 'export_val'] = 0
len(df.loc[df.draught_delta==0])
115
len(df.loc[(df.draught_delta==0) & (df.import_val>0)])
22
len(df.loc[(df.draught_delta==0) & (df.export_val>0)])
3
# trade_in = (df.payload_in*df.dwt).sum()
# trade_out = (df.payload_out*df.dwt).sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
# df.loc[:, "export_val"] = 0
# df.loc[:, "import_val"] = 0
# for idx, row in df.iterrows():
#     delta = row['draught-diff']
#     if delta < 0:
#         import_val = (row.payload_in*row.dwt) - (row.payload_out*row.dwt)
#         export_val = 0
#     elif delta > 0:
#         import_val = 0
#         export_val = (row.payload_out*row.dwt) - (row.payload_in*row.dwt)
#     elif delta == 0:
#         if fr_im>fr_exp:
#             fr_import = fr_im-fr_exp
#             fr_export = 0
#         elif fr_im<fr_exp:
#             fr_export = fr_exp-fr_im
#             fr_import = 0
#         import_val = fr_import*row.dwt
#         export_val = fr_export*row.dwt
#     df.loc[idx, 'import_val'] = import_val
#     df.loc[idx, 'export_val'] = export_val
df_tartus_trade = df.copy()

Asseble both

df_final = pd.concat([df_latakia_trade, df_tartus_trade])
data_dir
'/home/wb514197/data/AIS/Syria'
df_final['date'] = pd.to_datetime(df_final['date-entry'])
df_final['ym'] = df_final['date'].dt.strftime('%Y-%m')
len(df_final)
810
len(df_final.loc[(df_final.export_val==0) & (df_final.import_val==0)])
143
df_final = df_final.loc[~ ((df_final.export_val==0) & (df_final.import_val==0))].copy()
df_final = df_final[['port-name', 'date-entry', 'ym', 'export_val', 'import_val']].groupby(['port-name', 'date-entry']).sum()
df_final.reset_index(inplace=True)
df_final['date-entry'] = pd.to_datetime(df_final['date-entry'])
df_final['ym'] = df_final['date-entry'].dt.strftime('%Y-%m')
df_final.to_csv(join(ais_dir, "final_results_12.13.csv"))