AIS Trade Estimation#
This notebook implements the AIS-derived trade estimation methodology from: Global economic impacts of COVID-19 lockdown measures stand out in high-frequency shipping data, Vesrchuur et al 2021
0.1 Load libraries
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
import os
from os.path import join
from glob import glob
import pandas as pd
import geopandas as gpd
import folium
from shapely.geometry import Point
import folium.plugins as plugins
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
import datetime
from datetime import timedelta
from port_call import create_port_calls
import boto3
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
pd.options.display.max_columns = None
0.2 Load AIS data
AIS data has been accessed through the UN Big Data Platform.
We have retrieved all AIS messages that intersect a 20 km. buffer from each port in Syria (Al Ladhiqiyah, Tartus, Baniyas), available from December 1st 2018 to August 31st 2022.
ais_dir = join(os.path.expanduser("~"), 'data', 'AIS')
data_dir = join(ais_dir, 'Syria')
data_files = glob(data_dir+"/*.csv")
dfs = [pd.read_csv(f, index_col=0) for f in data_files]
df = pd.concat(dfs)
df.polygon_name.unique()
array(['AL LADHIQIYAH', 'TARTUS', 'BANIYAS'], dtype=object)
df_latakia = df.loc[df.polygon_name=="AL LADHIQIYAH"].copy()
df_tartus = df.loc[df.polygon_name=="TARTUS"].copy()
df_baniyas = df.loc[df.polygon_name=="BANIYAS"].copy()
start = '2018-12-01'
end = '2022-08-31'
country = "Syria"
1.2 Data Preparation#
Run port call algorithm that converts AIS messages into trips.
The port call algorithm sorts AIS data by MMSI (unique identified per ship) and date/time of AIS message. It then captures days with consecutive AIS messages and groups them into a single trip, calculating new attributes for each trip: turnaround-time (total time spent at port between arrival and departure), difference in reported draft, and difference in direction of travel.
trips = create_port_calls(df_latakia, start, end, "Latakia", "Syria")
len(trips)
793
Filter out trips that are not related to trade
Only cargo and tanker vessels
Turnaround time < 5 hours or > 95th percentile (refueling or maintenance)
Turnaround time < 10 hours and direction within 45 degrees (passing by)
Vessel types or subtypes that do not relate to trade
trips.loc[trips.turn_around_time < 5, "passing2"] = "refueling"
trips.loc[trips.turn_around_time > np.percentile(trips.turn_around_time, 95), "passing2"] = "maintenance"
trips.loc[(trips.turn_around_time < 10) & (trips['heading-diff'].abs()<45), "passing2"] = "passing by"
trips.passing2.value_counts()
passing by 98
maintenance 40
refueling 20
Name: passing2, dtype: int64
data = trips.loc[trips.passing2.isna()].copy()
len(data)
635
data.vessel_type.unique()
array(['Cargo', 'UNAVAILABLE', 'Passenger', 'Unknown', 'Reserved'],
dtype=object)
data.vessel_type_main.unique()
array(['Container Ship', 'General Cargo Ship', nan,
'Oil And Chemical Tanker', 'Bulk Carrier', 'Ro Ro Cargo Ship',
'Specialized Cargo Ship', 'Offshore Vessel'], dtype=object)
data.loc[data.vessel_type=="Reserved", "vessel_type_main"].unique()
array([nan], dtype=object)
data.loc[data.vessel_type=="Passenger", "vessel_type_main"].unique()
array(['Ro Ro Cargo Ship', nan], dtype=object)
drop_types = ["Reserved", "Passenger"]
data = data.loc[~(data.vessel_type.isin(drop_types))].copy()
len(data)
627
data.vessel_type_sub.unique()
array([nan, 'Oil Products Tanker', 'Vehicles Carrier',
'Livestock Carrier', 'Offshore Tug Supply Ship'], dtype=object)
drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
data = data.loc[~(data.vessel_type_sub.isin(drop_sub_types))].copy()
len(data)
625
1.3 Predict DWT#
Match vessel information from Fleetmon database, and impute missing values
data.loc[:, "mmsi"] = data.loc[:, "mmsi"].astype('int')
vessels = pd.read_csv(join(ais_dir, "vessel_database_full.csv"))
# vessels = pd.read_excel(join(ais_dir, "Pacific_vessel_database_2021_new.xlsx"), 0)
vessels.head(2)
mmsi | dwt | length_design | width_design | draught_design | block | vessel_type_AIS | sub_type | sub_vessel_type_AIS | |
---|---|---|---|---|---|---|---|---|---|
0 | 667001408 | 82.0 | 34.5 | 11.008 | 2.1 | 0.67 | Cargo | Container Ship | Container |
1 | 351863000 | 90.0 | 30.5 | 9.919 | 3.5 | 0.67 | Cargo | Container Ship | Container |
vessels.rename(columns={
'length_design':'length',
'width_design':'width',
'vessel_type_AIS':'vessel_type',
'sub_vessel_type_AIS':'sub_vessel_type'
}, inplace=True)
vessels = vessels.loc[~(vessels.mmsi.isna())].copy()
vessels.loc[:, "mmsi"] = vessels.loc[:, "mmsi"].astype('int')
vessels_filt = vessels[['sub_type', 'vessel_type', 'mmsi', 'dwt', 'length', 'width', 'draught_design']].copy()
# vessels_filt = vessels[['sub_vessel_type', 'vessel_type', 'mmsi', 'dwt', 'gt', 'length', 'width', 'Draft', 'flag']].copy()
vessels_filt.loc[:, 'vessel_info'] = 1
data_join = data.merge(vessels_filt, on='mmsi', how='left', suffixes=['_ais', '_vessel'])
data_join.loc[data_join.vessel_info.isna(), "vessel_info"] = 0
data_join.vessel_info.value_counts()
1.0 619
0.0 6
Name: vessel_info, dtype: int64
Prepare attributes for later use
block_coefficients = {
'bulk':0.79,
'container':0.73,
'tanker':0.83,
'LNG':0.79
}
# groups = {
# 'Dry Bulk':['Bulk carrier'],
# 'Container':['Container ship'],
# 'General Cargo':['General cargo vessel', 'Cargo ship'],
# 'Reefer':['Reefer'],
# 'Vehicle carrier':['Vehicle carrier'],
# 'Ro Ro Cargo Ship':['RoRo ship'],
# 'Animal products':['livestock carrier'],
# 'Forest':['Forest-producer carrier']
# }
# data_join.to_csv(join(ais_dir, "port_calls_latakia.csv"))
# data_join = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)
data_join.loc[:, "draught_delta"] = 1
data_join.loc[data_join['draught-diff']==0, "draught_delta"] = 0
data_join.draught_delta.value_counts()/len(data_join)
0 0.6192
1 0.3808
Name: draught_delta, dtype: float64
Only 38% have some draught delta
data_join.loc[:, 'draft_ais_max'] = data_join[['draught-in', 'draught-out']].max(axis=1)
train = data_join[(data_join.vessel_info==1) & (~data_join.dwt.isna())].copy()
df_missing = data_join[data_join.dwt.isna()].copy()
len(train)+len(df_missing)== len(data_join)
True
train.length_vessel.isna().sum(), train.width_vessel.isna().sum(), train.draught_design.isna().sum(), train.dwt.isna().sum()
(0, 0, 0, 0)
train.dwt.astype('int')
0 13623
1 20624
2 17728
3 35181
4 13623
...
620 31760
621 31760
622 35220
623 31760
624 9389
Name: dwt, Length: 619, dtype: int64
y = list(train.dwt.astype('int'))
X = [[row.length_vessel, row.width_vessel, row.draught_design] for idx, row in train.iterrows()]
len(y)==len(X)
True
# Spliting arrays or matrices into random train and test subsets
# i.e. 70 % training dataset and 25 % test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# creating a RF classifier
clf = RandomForestClassifier(n_estimators = 500, random_state=1)
# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)
# performing predictions on the test dataset
y_pred = clf.predict(X_test)
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
print("R-Squared OF THE MODEL: ", metrics.r2_score(y_test, y_pred))
ACCURACY OF THE MODEL: 0.7806451612903226
R-Squared OF THE MODEL: 0.9944090737438522
X_missing = [[row.length_ais, row.width_ais, row.draft_ais_max] for idx, row in df_missing.iterrows()]
pred_with_missing = clf.predict(X_missing)
df_missing.dwt = pred_with_missing
data_join.loc[data_join.dwt.isna(), 'dwt'] = df_missing.dwt
df = data_join.copy()
Add block coefficient category
df.columns
Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
'heading-diff', 'passing', 'port-name', 'country', 'passing2',
'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
'draft_ais_max'],
dtype='object')
df.loc[:, 'block_cat'] = ""
df.loc[:, 'vessel_type_best'] = df.sub_type #df.sub_vessel_type
df.vessel_type_best.isna().sum()
177
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_vessel'] # vessel_type_vessel
df.vessel_type_best.isna().sum()
6
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_sub']
df.vessel_type_best.isna().sum()
6
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_main']
df.vessel_type_best.isna().sum()
5
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_ais']
df.vessel_type_best.isna().sum()
0
df.loc[df.vessel_type_best.str.lower().str.contains('container'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('bulk'), "block_cat"] = 'bulk'
df.loc[df.vessel_type_best.str.lower().str.contains('tanker'), "block_cat"] = 'tanker'
df.loc[df.vessel_type_best.str.lower().str.contains('lng'), "block_cat"] = 'LNG'
df.loc[df.block_cat=='', "vessel_type_best"].unique()
array(['General Cargo Ship', 'Cargo', 'Vehicles Carrier',
'Livestock Carrier'], dtype=object)
df.loc[df.vessel_type_best.str.lower().str.contains('cargo'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('roro'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('vehicle'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('forest'), "block_cat"] = 'bulk'
df.loc[df.vessel_type_best.str.lower().str.contains('livestock'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('reefer'), "block_cat"] = 'container'
df.block_cat.unique()
array(['container', 'tanker', 'bulk'], dtype=object)
Get draft differences from next
aws_bucket = "wbgdecinternal-ntl"
path = "Andres_Temp/AIS"
client = boto3.client('s3')
file_list = client.list_objects_v2(Bucket=aws_bucket, Prefix=path, MaxKeys=5000)
bucket_files = [os.path.join("s3://", aws_bucket, content['Key']) for content in file_list['Contents']]
# bucket_files
df_next = pd.read_csv('s3://wbgdecinternal-ntl/Andres_Temp/AIS/Latakia_next_draught.csv', index_col=0)
len(df_next)
308
df_orig = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)
df_next = df_next.join(df_orig[['turn_around_time', 'date-leave']])
df = df.merge(df_next[['mmsi', 'draught', 'heading', 'date-leave', 'turn_around_time']], on=['mmsi', 'date-leave', 'turn_around_time'], how='left').copy()
df.loc[(df.draught_delta==0) & (~df.draught.isna()), "draught-out"] = df.loc[(df.draught_delta==0) & (~df.draught.isna()), "draught"]
df.loc[(df.draught_delta==0) & (~df.draught.isna()), "heading-out"] = df.loc[(df.draught_delta==0) & (~df.draught.isna()), "heading"]
df['draught-diff'] = df['draught-out'] - df['draught-in']
df.loc[df['draught-diff']!=0, "draught_delta"] = 1
df.draught_delta.value_counts()
1 482
0 143
Name: draught_delta, dtype: int64
Calculate payload
df.loc[df['draught-in']==0, 'draught_delta'] = 0
df.loc[df['draught-out']==0, 'draught_delta'] = 0
df.loc[df['draught-in']==0, 'draught-diff'] = 0
df.loc[df['draught-out']==0, 'draught-diff'] = 0
df.loc[:, 'draft_max'] = df[['draught_design', 'draught-in', 'draught-out']].max(axis=1)
df.columns
Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
'heading-diff', 'passing', 'port-name', 'country', 'passing2',
'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
'draft_ais_max', 'block_cat', 'vessel_type_best', 'draught', 'heading',
'draft_max', 'length', 'width', 'payload_in', 'payload_out'],
dtype='object')
df.loc[:, 'length'] = df.loc[:, 'length_vessel']
df.loc[df.length.isna(), 'length'] = df.loc[df.length.isna(), 'length_ais']
df.loc[:, 'width'] = df.loc[:, 'width_vessel']
df.loc[df.width.isna(), 'width'] = df.loc[df.width.isna(), 'width_ais']
def calculate_payload(row, direction):
# get parameters
Din = row['draught-in']
Dout = row['draught-out']
if Din==Dout:
return pd.NA
else:
L = row['length']
W = row['width']
Dd = row['draft_max']
DWT = row['dwt']
Cb = block_coefficients[row['block_cat']]
pw = 1.025 # tons/m3 1029kg/m3
if direction=="in":
Dr = Din
if direction =='out':
Dr = Dout
# calcualte block coefficient for reported draft
Cbr = 1 - ((1 - Cb)*((Dr / Dd)**(1/3)))
# calcualte payload rate
payload = ((((Cbr*Dr) - (Cb*Dd)) * (L*W*pw)) + DWT ) / DWT
return payload
df.loc[:, "payload_in"] = df.apply(lambda x: calculate_payload(x, 'in'), axis=1)
df.loc[:, "payload_out"] = df.apply(lambda x: calculate_payload(x, 'out'), axis=1)
df.head(2)
date-leave | mmsi | turn_around_time | datetime-leave | draught-out | heading-out | seconds | datetime-entry | date-entry | vessel_type_ais | dtg | length_ais | width_ais | draught-in | heading-in | vessel_type_main | vessel_type_sub | draught-diff | heading-diff | passing | port-name | country | passing2 | sub_type | vessel_type_vessel | dwt | length_vessel | width_vessel | draught_design | vessel_info | draught_delta | draft_ais_max | block_cat | vessel_type_best | draught | heading | draft_max | length | width | payload_in | payload_out | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-01-07 | 271044398 | 31.154722 | 2019-01-07 15:06:36 | 6.4 | 0.0 | 112157.0 | 2019-01-06 07:57:19 | 2019-01-06 | Cargo | 2019-01-06 07:39:16 | 152.0 | 24.0 | 8.9 | 0.0 | Container Ship | NaN | -2.5 | 0.0 | 3 | Latakia | Syria | NaN | Container Ship | Cargo | 13623.000000 | 151.0 | 24.0 | 8.25 | 1.0 | 1 | 8.9 | container | Container Ship | 6.4 | 0.0 | 8.9 | 151.0 | 24.0 | 1.0 | 0.551421 |
1 | 2019-01-10 | 271040403 | 10.353056 | 2019-01-10 19:54:11 | 9.3 | 48.0 | 37271.0 | 2019-01-10 09:33:00 | 2019-01-10 | Cargo | 2019-01-10 09:33:00 | 184.0 | 25.0 | 9.3 | 48.0 | Container Ship | NaN | 0.0 | 0.0 | 3 | Latakia | Syria | NaN | Container Ship | Cargo | 20624.133195 | 184.0 | 25.0 | 9.40 | 1.0 | 0 | 9.3 | container | Container Ship | NaN | NaN | 9.4 | 184.0 | 25.0 | <NA> | <NA> |
# trade_in = (df.payload_in*df.dwt).sum()
# trade_out = (df.payload_out*df.dwt).sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
df.loc[:, "export_val"] = 0
df.loc[:, "import_val"] = 0
for idx, row in df.iterrows():
delta = row['draught-diff']
if delta < 0:
import_val = (row.payload_in*row.dwt) - (row.payload_out*row.dwt)
export_val = 0
elif delta > 0:
import_val = 0
export_val = (row.payload_out*row.dwt) - (row.payload_in*row.dwt)
df.loc[idx, 'import_val'] = import_val
df.loc[idx, 'export_val'] = export_val
df_sub = df.loc[df.draught_delta==1].copy()
avg_flow = df_sub[['mmsi', 'import_val', 'export_val']].groupby('mmsi').mean()
avg_flow.loc[avg_flow.import_val>avg_flow.export_val, "import_avg"] = avg_flow.import_val
avg_flow.loc[avg_flow.import_val<avg_flow.export_val, "export_avg"] = avg_flow.export_val
avg_flow.fillna(0, inplace=True)
# avg_flow.reset_index(inplace=True)
avg_flow.drop(['import_val', 'export_val'], axis=1, inplace=True)
# trade_in = df.loc[df['draught-diff']<0, "import_val"].sum()
# trade_out = df.loc[df['draught-diff']>0, "export_val"].sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
for idx, row in df.iterrows():
delta = row['draught-diff']
if delta == 0:
try:
avg = avg_flow.loc[row.mmsi]
df.loc[idx, 'import_val'] = avg.import_avg # import_val
df.loc[idx, 'export_val'] = avg.export_avg # export_val
except:
df.loc[idx, 'import_val'] = 0
df.loc[idx, 'export_val'] = 0
# import_val = fr_im*row.dwt
# export_val = fr_exp*row.dwt
len(df.loc[df.draught_delta==0])
149
len(df.loc[(df.draught_delta==0) & (df.import_val>0)])
90
len(df.loc[(df.draught_delta==0) & (df.export_val>0)])
6
df_latakia_trade = df.copy()
# df_final = df_latakia_trade.copy()
# df_final['date-entry']
# df_final.loc[:, "ym"] =
# df_final[['port-name', 'date-entry', 'export_val', 'import_val']].groupby(['port-name', 'date-entry']).sum().to_csv(join(ais_dir, "preliminary_results_latakia_11.22.v3.csv"))
Run Tartus#
1.2 Data Prep#
Run port call algorithm that converts AIS messages into trips
trips = create_port_calls(df_tartus, start, end, "Tartus", "Syria")
len(trips)
307
Filter out non-trade trips
trips.loc[trips.turn_around_time < 5, "passing2"] = "refueling"
trips.loc[trips.turn_around_time > np.percentile(trips.turn_around_time, 95), "passing2"] = "maintenance"
trips.loc[(trips.turn_around_time < 10) & (trips['heading-diff'].abs()<45), "passing2"] = "passing by"
trips.passing2.value_counts()
passing by 86
maintenance 16
refueling 11
Name: passing2, dtype: int64
data = trips.loc[trips.passing2.isna()].copy()
len(data)
194
data.vessel_type.unique()
array(['Cargo', 'Tanker', 'Other', 'Passenger', 'SAR', 'Tug',
'UNAVAILABLE', 'Unknown', 'Towing'], dtype=object)
data.vessel_type_main.unique()
array(['Container Ship', nan, 'General Cargo Ship', 'Bulk Carrier',
'Other Tanker', 'Specialized Cargo Ship', 'Ro Ro Cargo Ship',
'Oil And Chemical Tanker'], dtype=object)
drop_types = ["Reserved", "Passenger", "Passenger", "Tug", "Towing", "SAR"]
data = data.loc[~(data.vessel_type.isin(drop_types))].copy()
len(data)
185
data.vessel_type_sub.unique()
array([nan, 'Vegetable Oil Tanker', 'Livestock Carrier',
'Chemical Oil Products Tanker', 'Chemical Tanker'], dtype=object)
# drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
# data = data.loc[~(data.vessel_type_sub.isin(drop_sub_types))].copy()
# data.loc[664]
1.3 Predict DWT#
Match vessel information from Fleetmon database, and impute missing values
data.loc[:, "mmsi"] = data.loc[:, "mmsi"].astype('int')
vessels = pd.read_csv(join(ais_dir, "vessel_database_full.csv"))
# vessels = pd.read_excel(join(ais_dir, "Pacific_vessel_database_2021_new.xlsx"), 0)
vessels.head(2)
mmsi | dwt | length_design | width_design | draught_design | block | vessel_type_AIS | sub_type | sub_vessel_type_AIS | |
---|---|---|---|---|---|---|---|---|---|
0 | 667001408 | 82.0 | 34.5 | 11.008 | 2.1 | 0.67 | Cargo | Container Ship | Container |
1 | 351863000 | 90.0 | 30.5 | 9.919 | 3.5 | 0.67 | Cargo | Container Ship | Container |
vessels.rename(columns={
'length_design':'length',
'width_design':'width',
'vessel_type_AIS':'vessel_type',
'sub_vessel_type_AIS':'sub_vessel_type'
}, inplace=True)
vessels = vessels.loc[~(vessels.mmsi.isna())].copy()
vessels.loc[:, "mmsi"] = vessels.loc[:, "mmsi"].astype('int')
vessels_filt = vessels[['sub_type', 'vessel_type', 'mmsi', 'dwt', 'length', 'width', 'draught_design']].copy()
vessels_filt.loc[:, 'vessel_info'] = 1
data_join = data.merge(vessels_filt, on='mmsi', how='left', suffixes=['_ais', '_vessel'])
data_join.loc[data_join.vessel_info.isna(), "vessel_info"] = 0
data_join.vessel_info.value_counts()
1.0 157
0.0 28
Name: vessel_info, dtype: int64
data_join.vessel_type_vessel.unique()
array(['Cargo', 'Tanker', nan], dtype=object)
For later use
block_coefficients = {
'bulk':0.79,
'container':0.73,
'tanker':0.83,
'LNG':0.79
}
groups = {
'Dry Bulk':['Bulk carrier'],
'Container':['Container ship'],
'General Cargo':['General cargo vessel', 'Cargo ship'],
'Reefer':['Reefer'],
'Vehicle carrier':['Vehicle carrier'],
'Ro Ro Cargo Ship':['RoRo ship'],
'Animal products':['livestock carrier'],
'Forest':['Forest-producer carrier']
}
# drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
# data_join = data_join.loc[~(data_join.vessel_type_sub.isin(drop_sub_types))].copy()
# data_join.to_csv(join(ais_dir, "port_calls_latakia.csv"))
# data_join = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)
data_join.loc[:, "draught_delta"] = 1
data_join.loc[data_join['draught-diff']==0, "draught_delta"] = 0
data_join.draught_delta.value_counts()/len(data_join)
0 0.848649
1 0.151351
Name: draught_delta, dtype: float64
data_join.draught_delta.value_counts()
0 157
1 28
Name: draught_delta, dtype: int64
data_join.loc[:, 'draft_ais_max'] = data_join[['draught-in', 'draught-out']].max(axis=1)
# data_join[data_join.dwt.isna()].iloc[0]
# data_join[data_join.draft_ais_max==0]
train = data_join[(data_join.vessel_info==1) & (~data_join.dwt.isna())].copy()
df_missing = data_join[data_join.dwt.isna()].copy()
len(train)+len(df_missing)== len(data_join)
True
train.length_vessel.isna().sum(), train.width_vessel.isna().sum(), train.draught_design.isna().sum(), train.dwt.isna().sum()
(0, 0, 0, 0)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
y = list(train.dwt.astype('int'))
X = [[row.length_vessel, row.width_vessel, row.draught_design] for idx, row in train.iterrows()]
len(y)==len(X)
True
# Spliting arrays or matrices into random train and test subsets
# i.e. 70 % training dataset and 25 % test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# creating a RF classifier
clf = RandomForestClassifier(n_estimators = 500, random_state=1)
# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)
# performing predictions on the test dataset
y_pred = clf.predict(X_test)
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
print("R-Squared OF THE MODEL: ", metrics.r2_score(y_test, y_pred))
ACCURACY OF THE MODEL: 0.7
R-Squared OF THE MODEL: 0.9772705948715836
# df_missing.loc[df_missing.length_ais==0]
X_missing = [[row.length_ais, row.width_ais, row.draft_ais_max] for idx, row in df_missing.iterrows()]
pred_with_missing = clf.predict(X_missing)
df_missing.dwt = pred_with_missing
data_join.loc[data_join.dwt.isna(), 'dwt'] = df_missing.dwt
# data_join = data_join.loc[data_join.draught_delta==1].copy()
# data_join = data_join.loc[data_join.vessel_info==1].copy()
# data_join['draught-diff'].describe()
df = data_join.copy()
Add block coefficient category
df.columns
Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
'heading-diff', 'passing', 'port-name', 'country', 'passing2',
'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
'draft_ais_max'],
dtype='object')
df.loc[:, 'block_cat'] = ""
df.loc[:, 'vessel_type_best'] = df.sub_type
df.vessel_type_best.isna().sum()
80
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_vessel']
df.vessel_type_best.isna().sum()
28
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_sub']
df.vessel_type_best.isna().sum()
28
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_main']
df.vessel_type_best.isna().sum()
28
df.loc[df.vessel_type_best.isna(), 'vessel_type_best'] = df.loc[df.vessel_type_best.isna(), 'vessel_type_ais']
df.vessel_type_best.isna().sum()
0
df.loc[df.vessel_type_best.str.lower().str.contains('container'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('bulk'), "block_cat"] = 'bulk'
df.loc[df.vessel_type_best.str.lower().str.contains('tanker'), "block_cat"] = 'tanker'
df.loc[df.vessel_type_best.str.lower().str.contains('lng'), "block_cat"] = 'LNG'
df.loc[df.block_cat=='', "vessel_type_best"].unique()
array([], dtype=object)
# df.loc[df.block_cat=='bulk', "sub_vessel_type"].unique()
# df.loc[df.block_cat=='container', "sub_vessel_type"].unique()
# df.loc[df.vessel_type_best=='Other', ]
df.loc[df.vessel_type_best.str.lower().str.contains('cargo'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('roro'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('vehicle'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('forest'), "block_cat"] = 'bulk'
df.loc[df.vessel_type_best.str.lower().str.contains('livestock'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('reefer'), "block_cat"] = 'container'
df.loc[df.vessel_type_best.str.lower().str.contains('other'), "block_cat"] = 'container'
df.block_cat.unique()
array(['container', 'tanker', 'bulk'], dtype=object)
Get draft differences from next
aws_bucket = "wbgdecinternal-ntl"
path = "Andres_Temp/AIS"
client = boto3.client('s3')
file_list = client.list_objects_v2(Bucket=aws_bucket, Prefix=path, MaxKeys=5000)
bucket_files = [os.path.join("s3://", aws_bucket, content['Key']) for content in file_list['Contents']]
# bucket_files
df_next = pd.read_csv('s3://wbgdecinternal-ntl/Andres_Temp/AIS/Tartus_next_draught.csv', index_col=0)
len(df_next)
67
df_orig = pd.read_csv(join(ais_dir, "port_calls_tartus.csv"), index_col=0)
df_orig[['turn_around_time', 'date-leave']].head(2)
turn_around_time | date-leave | |
---|---|---|
0 | 21.360278 | 2018-12-20 |
1 | 10.200833 | 2019-01-15 |
df_next = df_next.join(df_orig[['turn_around_time', 'date-leave']])
df = df.merge(df_next[['mmsi', 'draught', 'heading', 'date-leave', 'turn_around_time']], on=['mmsi', 'date-leave', 'turn_around_time'], how='left').copy()
df.loc[(df.draught_delta==0) & (~df.draught.isna()), "draught-out"] = df.loc[(df.draught_delta==0) & (~df.draught.isna()), "draught"]
df.loc[(df.draught_delta==0) & (~df.draught.isna()), "heading-out"] = df.loc[(df.draught_delta==0) & (~df.draught.isna()), "heading"]
df['draught-diff'] = df['draught-out'] - df['draught-in']
df.loc[df['draught-diff']!=0, "draught_delta"] = 1
df.draught_delta.value_counts()
0 111
1 74
Name: draught_delta, dtype: int64
df.loc[df['draught-in']==0, 'draught_delta'] = 0
df.loc[df['draught-out']==0, 'draught_delta'] = 0
df.loc[df['draught-in']==0, 'draught-diff'] = 0
df.loc[df['draught-out']==0, 'draught-diff'] = 0
df.loc[:, 'draft_max'] = df[['draught_design', 'draught-in', 'draught-out']].max(axis=1)
df.loc[:, 'length'] = df.loc[:, 'length_vessel']
df.loc[df.length.isna(), 'length'] = df.loc[df.length.isna(), 'length_ais']
df.loc[:, 'width'] = df.loc[:, 'width_vessel']
df.loc[df.width.isna(), 'width'] = df.loc[df.width.isna(), 'width_ais']
def calculate_payload(row, direction):
# get parameters
Din = row['draught-in']
Dout = row['draught-out']
if Din==Dout:
return pd.NA
else:
L = row['length']
W = row['width']
Dd = row['draft_max']
DWT = row['dwt']
Cb = block_coefficients[row['block_cat']]
pw = 1.025 # tons/m3 1029kg/m3
if direction=="in":
Dr = Din
if direction =='out':
Dr = Dout
# calcualte block coefficient for reported draft
Cbr = 1 - ((1 - Cb)*((Dr / Dd)**(1/3)))
# calcualte payload rate
payload = ((((Cbr*Dr) - (Cb*Dd)) * (L*W*pw)) + DWT ) / DWT
return payload
df.loc[:, "payload_in"] = df.apply(lambda x: calculate_payload(x, 'in'), axis=1)
df.loc[:, "payload_out"] = df.apply(lambda x: calculate_payload(x, 'out'), axis=1)
# df.loc[:, ['payload_in', 'payload_out']]
df.loc[:, "export_val"] = 0
df.loc[:, "import_val"] = 0
for idx, row in df.iterrows():
delta = row['draught-diff']
if delta < 0:
import_val = (row.payload_in*row.dwt) - (row.payload_out*row.dwt)
export_val = 0
elif delta > 0:
import_val = 0
export_val = (row.payload_out*row.dwt) - (row.payload_in*row.dwt)
df.loc[idx, 'import_val'] = import_val
df.loc[idx, 'export_val'] = export_val
# trade_in = df.loc[df['draught-diff']<0, "import_val"].sum()
# trade_out = df.loc[df['draught-diff']>0, "export_val"].sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
df_sub = df.loc[df.draught_delta==1].copy()
avg_flow = df_sub[['mmsi', 'import_val', 'export_val']].groupby('mmsi').mean()
avg_flow.loc[avg_flow.import_val>avg_flow.export_val, "import_avg"] = avg_flow.import_val
avg_flow.loc[avg_flow.import_val<avg_flow.export_val, "export_avg"] = avg_flow.export_val
avg_flow.fillna(0, inplace=True)
# avg_flow.reset_index(inplace=True)
avg_flow.drop(['import_val', 'export_val'], axis=1, inplace=True)
len(avg_flow)
26
for idx, row in df.iterrows():
delta = row['draught-diff']
if delta == 0:
try:
avg = avg_flow.loc[row.mmsi]
df.loc[idx, 'import_val'] = avg.import_avg # import_val
df.loc[idx, 'export_val'] = avg.export_avg # export_val
except:
df.loc[idx, 'import_val'] = 0
df.loc[idx, 'export_val'] = 0
len(df.loc[df.draught_delta==0])
115
len(df.loc[(df.draught_delta==0) & (df.import_val>0)])
22
len(df.loc[(df.draught_delta==0) & (df.export_val>0)])
3
# trade_in = (df.payload_in*df.dwt).sum()
# trade_out = (df.payload_out*df.dwt).sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
# df.loc[:, "export_val"] = 0
# df.loc[:, "import_val"] = 0
# for idx, row in df.iterrows():
# delta = row['draught-diff']
# if delta < 0:
# import_val = (row.payload_in*row.dwt) - (row.payload_out*row.dwt)
# export_val = 0
# elif delta > 0:
# import_val = 0
# export_val = (row.payload_out*row.dwt) - (row.payload_in*row.dwt)
# elif delta == 0:
# if fr_im>fr_exp:
# fr_import = fr_im-fr_exp
# fr_export = 0
# elif fr_im<fr_exp:
# fr_export = fr_exp-fr_im
# fr_import = 0
# import_val = fr_import*row.dwt
# export_val = fr_export*row.dwt
# df.loc[idx, 'import_val'] = import_val
# df.loc[idx, 'export_val'] = export_val
df_tartus_trade = df.copy()
Asseble both
df_final = pd.concat([df_latakia_trade, df_tartus_trade])
data_dir
'/home/wb514197/data/AIS/Syria'
df_final['date'] = pd.to_datetime(df_final['date-entry'])
df_final['ym'] = df_final['date'].dt.strftime('%Y-%m')
len(df_final)
810
len(df_final.loc[(df_final.export_val==0) & (df_final.import_val==0)])
143
df_final = df_final.loc[~ ((df_final.export_val==0) & (df_final.import_val==0))].copy()
df_final = df_final[['port-name', 'date-entry', 'ym', 'export_val', 'import_val']].groupby(['port-name', 'date-entry']).sum()
df_final.reset_index(inplace=True)
df_final['date-entry'] = pd.to_datetime(df_final['date-entry'])
df_final['ym'] = df_final['date-entry'].dt.strftime('%Y-%m')
df_final.to_csv(join(ais_dir, "final_results_12.13.csv"))