AIS Trade Estimation#
This notebook implements the AIS-derived trade estimation methodology from: Global economic impacts of COVID-19 lockdown measures stand out in high-frequency shipping data, Vesrchuur et al 2021
0.1 Load libraries
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
import os
from os.path import join
from glob import glob
import pandas as pd
import numpy as np
from port_call import create_port_calls
import boto3
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
pd.options.display.max_columns = None
0.2 Load AIS data
AIS data has been accessed through the UN Big Data Platform.
We have retrieved all AIS messages that intersect a 20 km. buffer from each port in Syria (Al Ladhiqiyah, Tartus, Baniyas), available from December 1st 2018 to August 31st 2022.
ais_dir = join(os.path.expanduser("~"), "data", "AIS")
data_dir = join(ais_dir, "Syria")
data_files = glob(data_dir + "/*.csv")
dfs = [pd.read_csv(f, index_col=0) for f in data_files]
df = pd.concat(dfs)
df.polygon_name.unique()
array(['AL LADHIQIYAH', 'TARTUS', 'BANIYAS'], dtype=object)
df_latakia = df.loc[df.polygon_name == "AL LADHIQIYAH"].copy()
df_tartus = df.loc[df.polygon_name == "TARTUS"].copy()
df_baniyas = df.loc[df.polygon_name == "BANIYAS"].copy()
start = "2018-12-01"
end = "2022-08-31"
country = "Syria"
1.2 Data Preparation#
Run port call algorithm that converts AIS messages into trips.
The port call algorithm sorts AIS data by MMSI (unique identified per ship) and date/time of AIS message. It then captures days with consecutive AIS messages and groups them into a single trip, calculating new attributes for each trip: turnaround-time (total time spent at port between arrival and departure), difference in reported draft, and difference in direction of travel.
trips = create_port_calls(df_latakia, start, end, "Latakia", "Syria")
len(trips)
793
Filter out trips that are not related to trade
Only cargo and tanker vessels
Turnaround time < 5 hours or > 95th percentile (refueling or maintenance)
Turnaround time < 10 hours and direction within 45 degrees (passing by)
Vessel types or subtypes that do not relate to trade
trips.loc[trips.turn_around_time < 5, "passing2"] = "refueling"
trips.loc[
trips.turn_around_time > np.percentile(trips.turn_around_time, 95), "passing2"
] = "maintenance"
trips.loc[
(trips.turn_around_time < 10) & (trips["heading-diff"].abs() < 45), "passing2"
] = "passing by"
trips.passing2.value_counts()
passing by 98
maintenance 40
refueling 20
Name: passing2, dtype: int64
data = trips.loc[trips.passing2.isna()].copy()
len(data)
635
data.vessel_type.unique()
array(['Cargo', 'UNAVAILABLE', 'Passenger', 'Unknown', 'Reserved'],
dtype=object)
data.vessel_type_main.unique()
array(['Container Ship', 'General Cargo Ship', nan,
'Oil And Chemical Tanker', 'Bulk Carrier', 'Ro Ro Cargo Ship',
'Specialized Cargo Ship', 'Offshore Vessel'], dtype=object)
data.loc[data.vessel_type == "Reserved", "vessel_type_main"].unique()
array([nan], dtype=object)
data.loc[data.vessel_type == "Passenger", "vessel_type_main"].unique()
array(['Ro Ro Cargo Ship', nan], dtype=object)
drop_types = ["Reserved", "Passenger"]
data = data.loc[~(data.vessel_type.isin(drop_types))].copy()
len(data)
627
data.vessel_type_sub.unique()
array([nan, 'Oil Products Tanker', 'Vehicles Carrier',
'Livestock Carrier', 'Offshore Tug Supply Ship'], dtype=object)
drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
data = data.loc[~(data.vessel_type_sub.isin(drop_sub_types))].copy()
len(data)
625
1.3 Predict DWT#
Match vessel information from Fleetmon database, and impute missing values
data.loc[:, "mmsi"] = data.loc[:, "mmsi"].astype("int")
vessels = pd.read_csv(join(ais_dir, "vessel_database_full.csv"))
# vessels = pd.read_excel(join(ais_dir, "Pacific_vessel_database_2021_new.xlsx"), 0)
vessels.head(2)
mmsi | dwt | length_design | width_design | draught_design | block | vessel_type_AIS | sub_type | sub_vessel_type_AIS | |
---|---|---|---|---|---|---|---|---|---|
0 | 667001408 | 82.0 | 34.5 | 11.008 | 2.1 | 0.67 | Cargo | Container Ship | Container |
1 | 351863000 | 90.0 | 30.5 | 9.919 | 3.5 | 0.67 | Cargo | Container Ship | Container |
vessels.rename(
columns={
"length_design": "length",
"width_design": "width",
"vessel_type_AIS": "vessel_type",
"sub_vessel_type_AIS": "sub_vessel_type",
},
inplace=True,
)
vessels = vessels.loc[~(vessels.mmsi.isna())].copy()
vessels.loc[:, "mmsi"] = vessels.loc[:, "mmsi"].astype("int")
vessels_filt = vessels[
["sub_type", "vessel_type", "mmsi", "dwt", "length", "width", "draught_design"]
].copy()
# vessels_filt = vessels[['sub_vessel_type', 'vessel_type', 'mmsi', 'dwt', 'gt', 'length', 'width', 'Draft', 'flag']].copy()
vessels_filt.loc[:, "vessel_info"] = 1
data_join = data.merge(
vessels_filt, on="mmsi", how="left", suffixes=["_ais", "_vessel"]
)
data_join.loc[data_join.vessel_info.isna(), "vessel_info"] = 0
data_join.vessel_info.value_counts()
1.0 619
0.0 6
Name: vessel_info, dtype: int64
Prepare attributes for later use
block_coefficients = {"bulk": 0.79, "container": 0.73, "tanker": 0.83, "LNG": 0.79}
# groups = {
# 'Dry Bulk':['Bulk carrier'],
# 'Container':['Container ship'],
# 'General Cargo':['General cargo vessel', 'Cargo ship'],
# 'Reefer':['Reefer'],
# 'Vehicle carrier':['Vehicle carrier'],
# 'Ro Ro Cargo Ship':['RoRo ship'],
# 'Animal products':['livestock carrier'],
# 'Forest':['Forest-producer carrier']
# }
# data_join.to_csv(join(ais_dir, "port_calls_latakia.csv"))
# data_join = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)
data_join.loc[:, "draught_delta"] = 1
data_join.loc[data_join["draught-diff"] == 0, "draught_delta"] = 0
data_join.draught_delta.value_counts() / len(data_join)
0 0.6192
1 0.3808
Name: draught_delta, dtype: float64
Only 38% have some draught delta
data_join.loc[:, "draft_ais_max"] = data_join[["draught-in", "draught-out"]].max(axis=1)
train = data_join[(data_join.vessel_info == 1) & (~data_join.dwt.isna())].copy()
df_missing = data_join[data_join.dwt.isna()].copy()
len(train) + len(df_missing) == len(data_join)
True
(
train.length_vessel.isna().sum(),
train.width_vessel.isna().sum(),
train.draught_design.isna().sum(),
train.dwt.isna().sum(),
)
(0, 0, 0, 0)
train.dwt.astype("int")
0 13623
1 20624
2 17728
3 35181
4 13623
...
620 31760
621 31760
622 35220
623 31760
624 9389
Name: dwt, Length: 619, dtype: int64
y = list(train.dwt.astype("int"))
X = [
[row.length_vessel, row.width_vessel, row.draught_design]
for idx, row in train.iterrows()
]
len(y) == len(X)
True
# Spliting arrays or matrices into random train and test subsets
# i.e. 70 % training dataset and 25 % test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# creating a RF classifier
clf = RandomForestClassifier(n_estimators=500, random_state=1)
# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)
# performing predictions on the test dataset
y_pred = clf.predict(X_test)
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
print("R-Squared OF THE MODEL: ", metrics.r2_score(y_test, y_pred))
ACCURACY OF THE MODEL: 0.7806451612903226
R-Squared OF THE MODEL: 0.9944090737438522
X_missing = [
[row.length_ais, row.width_ais, row.draft_ais_max]
for idx, row in df_missing.iterrows()
]
pred_with_missing = clf.predict(X_missing)
df_missing.dwt = pred_with_missing
data_join.loc[data_join.dwt.isna(), "dwt"] = df_missing.dwt
df = data_join.copy()
Add block coefficient category
df.columns
Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
'heading-diff', 'passing', 'port-name', 'country', 'passing2',
'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
'draft_ais_max'],
dtype='object')
df.loc[:, "block_cat"] = ""
df.loc[:, "vessel_type_best"] = df.sub_type # df.sub_vessel_type
df.vessel_type_best.isna().sum()
177
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
df.vessel_type_best.isna(), "vessel_type_vessel"
] # vessel_type_vessel
df.vessel_type_best.isna().sum()
6
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
df.vessel_type_best.isna(), "vessel_type_sub"
]
df.vessel_type_best.isna().sum()
6
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
df.vessel_type_best.isna(), "vessel_type_main"
]
df.vessel_type_best.isna().sum()
5
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
df.vessel_type_best.isna(), "vessel_type_ais"
]
df.vessel_type_best.isna().sum()
0
df.loc[df.vessel_type_best.str.lower().str.contains("container"), "block_cat"] = (
"container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("bulk"), "block_cat"] = "bulk"
df.loc[df.vessel_type_best.str.lower().str.contains("tanker"), "block_cat"] = "tanker"
df.loc[df.vessel_type_best.str.lower().str.contains("lng"), "block_cat"] = "LNG"
df.loc[df.block_cat == "", "vessel_type_best"].unique()
array(['General Cargo Ship', 'Cargo', 'Vehicles Carrier',
'Livestock Carrier'], dtype=object)
df.loc[df.vessel_type_best.str.lower().str.contains("cargo"), "block_cat"] = "container"
df.loc[df.vessel_type_best.str.lower().str.contains("roro"), "block_cat"] = "container"
df.loc[df.vessel_type_best.str.lower().str.contains("vehicle"), "block_cat"] = (
"container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("forest"), "block_cat"] = "bulk"
df.loc[df.vessel_type_best.str.lower().str.contains("livestock"), "block_cat"] = (
"container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("reefer"), "block_cat"] = (
"container"
)
df.block_cat.unique()
array(['container', 'tanker', 'bulk'], dtype=object)
Get draft differences from next
aws_bucket = "wbgdecinternal-ntl"
path = "Andres_Temp/AIS"
client = boto3.client("s3")
file_list = client.list_objects_v2(Bucket=aws_bucket, Prefix=path, MaxKeys=5000)
bucket_files = [
os.path.join("s3://", aws_bucket, content["Key"])
for content in file_list["Contents"]
]
# bucket_files
df_next = pd.read_csv(
"s3://wbgdecinternal-ntl/Andres_Temp/AIS/Latakia_next_draught.csv", index_col=0
)
len(df_next)
308
df_orig = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)
df_next = df_next.join(df_orig[["turn_around_time", "date-leave"]])
df = df.merge(
df_next[["mmsi", "draught", "heading", "date-leave", "turn_around_time"]],
on=["mmsi", "date-leave", "turn_around_time"],
how="left",
).copy()
df.loc[(df.draught_delta == 0) & (~df.draught.isna()), "draught-out"] = df.loc[
(df.draught_delta == 0) & (~df.draught.isna()), "draught"
]
df.loc[(df.draught_delta == 0) & (~df.draught.isna()), "heading-out"] = df.loc[
(df.draught_delta == 0) & (~df.draught.isna()), "heading"
]
df["draught-diff"] = df["draught-out"] - df["draught-in"]
df.loc[df["draught-diff"] != 0, "draught_delta"] = 1
df.draught_delta.value_counts()
1 482
0 143
Name: draught_delta, dtype: int64
Calculate payload
df.loc[df["draught-in"] == 0, "draught_delta"] = 0
df.loc[df["draught-out"] == 0, "draught_delta"] = 0
df.loc[df["draught-in"] == 0, "draught-diff"] = 0
df.loc[df["draught-out"] == 0, "draught-diff"] = 0
df.loc[:, "draft_max"] = df[["draught_design", "draught-in", "draught-out"]].max(axis=1)
df.columns
Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
'heading-diff', 'passing', 'port-name', 'country', 'passing2',
'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
'draft_ais_max', 'block_cat', 'vessel_type_best', 'draught', 'heading',
'draft_max', 'length', 'width', 'payload_in', 'payload_out'],
dtype='object')
df.loc[:, "length"] = df.loc[:, "length_vessel"]
df.loc[df.length.isna(), "length"] = df.loc[df.length.isna(), "length_ais"]
df.loc[:, "width"] = df.loc[:, "width_vessel"]
df.loc[df.width.isna(), "width"] = df.loc[df.width.isna(), "width_ais"]
def calculate_payload(row, direction):
# get parameters
Din = row["draught-in"]
Dout = row["draught-out"]
if Din == Dout:
return pd.NA
else:
L = row["length"]
W = row["width"]
Dd = row["draft_max"]
DWT = row["dwt"]
Cb = block_coefficients[row["block_cat"]]
pw = 1.025 # tons/m3 1029kg/m3
if direction == "in":
Dr = Din
if direction == "out":
Dr = Dout
# calcualte block coefficient for reported draft
Cbr = 1 - ((1 - Cb) * ((Dr / Dd) ** (1 / 3)))
# calcualte payload rate
payload = ((((Cbr * Dr) - (Cb * Dd)) * (L * W * pw)) + DWT) / DWT
return payload
df.loc[:, "payload_in"] = df.apply(lambda x: calculate_payload(x, "in"), axis=1)
df.loc[:, "payload_out"] = df.apply(lambda x: calculate_payload(x, "out"), axis=1)
df.head(2)
date-leave | mmsi | turn_around_time | datetime-leave | draught-out | heading-out | seconds | datetime-entry | date-entry | vessel_type_ais | dtg | length_ais | width_ais | draught-in | heading-in | vessel_type_main | vessel_type_sub | draught-diff | heading-diff | passing | port-name | country | passing2 | sub_type | vessel_type_vessel | dwt | length_vessel | width_vessel | draught_design | vessel_info | draught_delta | draft_ais_max | block_cat | vessel_type_best | draught | heading | draft_max | length | width | payload_in | payload_out | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-01-07 | 271044398 | 31.154722 | 2019-01-07 15:06:36 | 6.4 | 0.0 | 112157.0 | 2019-01-06 07:57:19 | 2019-01-06 | Cargo | 2019-01-06 07:39:16 | 152.0 | 24.0 | 8.9 | 0.0 | Container Ship | NaN | -2.5 | 0.0 | 3 | Latakia | Syria | NaN | Container Ship | Cargo | 13623.000000 | 151.0 | 24.0 | 8.25 | 1.0 | 1 | 8.9 | container | Container Ship | 6.4 | 0.0 | 8.9 | 151.0 | 24.0 | 1.0 | 0.551421 |
1 | 2019-01-10 | 271040403 | 10.353056 | 2019-01-10 19:54:11 | 9.3 | 48.0 | 37271.0 | 2019-01-10 09:33:00 | 2019-01-10 | Cargo | 2019-01-10 09:33:00 | 184.0 | 25.0 | 9.3 | 48.0 | Container Ship | NaN | 0.0 | 0.0 | 3 | Latakia | Syria | NaN | Container Ship | Cargo | 20624.133195 | 184.0 | 25.0 | 9.40 | 1.0 | 0 | 9.3 | container | Container Ship | NaN | NaN | 9.4 | 184.0 | 25.0 | <NA> | <NA> |
# trade_in = (df.payload_in*df.dwt).sum()
# trade_out = (df.payload_out*df.dwt).sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
df.loc[:, "export_val"] = 0
df.loc[:, "import_val"] = 0
for idx, row in df.iterrows():
delta = row["draught-diff"]
if delta < 0:
import_val = (row.payload_in * row.dwt) - (row.payload_out * row.dwt)
export_val = 0
elif delta > 0:
import_val = 0
export_val = (row.payload_out * row.dwt) - (row.payload_in * row.dwt)
df.loc[idx, "import_val"] = import_val
df.loc[idx, "export_val"] = export_val
df_sub = df.loc[df.draught_delta == 1].copy()
avg_flow = df_sub[["mmsi", "import_val", "export_val"]].groupby("mmsi").mean()
avg_flow.loc[avg_flow.import_val > avg_flow.export_val, "import_avg"] = (
avg_flow.import_val
)
avg_flow.loc[avg_flow.import_val < avg_flow.export_val, "export_avg"] = (
avg_flow.export_val
)
avg_flow.fillna(0, inplace=True)
# avg_flow.reset_index(inplace=True)
avg_flow.drop(["import_val", "export_val"], axis=1, inplace=True)
# trade_in = df.loc[df['draught-diff']<0, "import_val"].sum()
# trade_out = df.loc[df['draught-diff']>0, "export_val"].sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
for idx, row in df.iterrows():
delta = row["draught-diff"]
if delta == 0:
try:
avg = avg_flow.loc[row.mmsi]
df.loc[idx, "import_val"] = avg.import_avg # import_val
df.loc[idx, "export_val"] = avg.export_avg # export_val
except:
df.loc[idx, "import_val"] = 0
df.loc[idx, "export_val"] = 0
# import_val = fr_im*row.dwt
# export_val = fr_exp*row.dwt
len(df.loc[df.draught_delta == 0])
149
len(df.loc[(df.draught_delta == 0) & (df.import_val > 0)])
90
len(df.loc[(df.draught_delta == 0) & (df.export_val > 0)])
6
df_latakia_trade = df.copy()
# df_final = df_latakia_trade.copy()
# df_final['date-entry']
# df_final.loc[:, "ym"] =
# df_final[['port-name', 'date-entry', 'export_val', 'import_val']].groupby(['port-name', 'date-entry']).sum().to_csv(join(ais_dir, "preliminary_results_latakia_11.22.v3.csv"))
Run Tartus#
1.2 Data Prep#
Run port call algorithm that converts AIS messages into trips
trips = create_port_calls(df_tartus, start, end, "Tartus", "Syria")
len(trips)
307
Filter out non-trade trips
trips.loc[trips.turn_around_time < 5, "passing2"] = "refueling"
trips.loc[
trips.turn_around_time > np.percentile(trips.turn_around_time, 95), "passing2"
] = "maintenance"
trips.loc[
(trips.turn_around_time < 10) & (trips["heading-diff"].abs() < 45), "passing2"
] = "passing by"
trips.passing2.value_counts()
passing by 86
maintenance 16
refueling 11
Name: passing2, dtype: int64
data = trips.loc[trips.passing2.isna()].copy()
len(data)
194
data.vessel_type.unique()
array(['Cargo', 'Tanker', 'Other', 'Passenger', 'SAR', 'Tug',
'UNAVAILABLE', 'Unknown', 'Towing'], dtype=object)
data.vessel_type_main.unique()
array(['Container Ship', nan, 'General Cargo Ship', 'Bulk Carrier',
'Other Tanker', 'Specialized Cargo Ship', 'Ro Ro Cargo Ship',
'Oil And Chemical Tanker'], dtype=object)
drop_types = ["Reserved", "Passenger", "Passenger", "Tug", "Towing", "SAR"]
data = data.loc[~(data.vessel_type.isin(drop_types))].copy()
len(data)
185
data.vessel_type_sub.unique()
array([nan, 'Vegetable Oil Tanker', 'Livestock Carrier',
'Chemical Oil Products Tanker', 'Chemical Tanker'], dtype=object)
# drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
# data = data.loc[~(data.vessel_type_sub.isin(drop_sub_types))].copy()
# data.loc[664]
1.3 Predict DWT#
Match vessel information from Fleetmon database, and impute missing values
data.loc[:, "mmsi"] = data.loc[:, "mmsi"].astype("int")
vessels = pd.read_csv(join(ais_dir, "vessel_database_full.csv"))
# vessels = pd.read_excel(join(ais_dir, "Pacific_vessel_database_2021_new.xlsx"), 0)
vessels.head(2)
mmsi | dwt | length_design | width_design | draught_design | block | vessel_type_AIS | sub_type | sub_vessel_type_AIS | |
---|---|---|---|---|---|---|---|---|---|
0 | 667001408 | 82.0 | 34.5 | 11.008 | 2.1 | 0.67 | Cargo | Container Ship | Container |
1 | 351863000 | 90.0 | 30.5 | 9.919 | 3.5 | 0.67 | Cargo | Container Ship | Container |
vessels.rename(
columns={
"length_design": "length",
"width_design": "width",
"vessel_type_AIS": "vessel_type",
"sub_vessel_type_AIS": "sub_vessel_type",
},
inplace=True,
)
vessels = vessels.loc[~(vessels.mmsi.isna())].copy()
vessels.loc[:, "mmsi"] = vessels.loc[:, "mmsi"].astype("int")
vessels_filt = vessels[
["sub_type", "vessel_type", "mmsi", "dwt", "length", "width", "draught_design"]
].copy()
vessels_filt.loc[:, "vessel_info"] = 1
data_join = data.merge(
vessels_filt, on="mmsi", how="left", suffixes=["_ais", "_vessel"]
)
data_join.loc[data_join.vessel_info.isna(), "vessel_info"] = 0
data_join.vessel_info.value_counts()
1.0 157
0.0 28
Name: vessel_info, dtype: int64
data_join.vessel_type_vessel.unique()
array(['Cargo', 'Tanker', nan], dtype=object)
For later use
block_coefficients = {"bulk": 0.79, "container": 0.73, "tanker": 0.83, "LNG": 0.79}
groups = {
"Dry Bulk": ["Bulk carrier"],
"Container": ["Container ship"],
"General Cargo": ["General cargo vessel", "Cargo ship"],
"Reefer": ["Reefer"],
"Vehicle carrier": ["Vehicle carrier"],
"Ro Ro Cargo Ship": ["RoRo ship"],
"Animal products": ["livestock carrier"],
"Forest": ["Forest-producer carrier"],
}
# drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
# data_join = data_join.loc[~(data_join.vessel_type_sub.isin(drop_sub_types))].copy()
# data_join.to_csv(join(ais_dir, "port_calls_latakia.csv"))
# data_join = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)
data_join.loc[:, "draught_delta"] = 1
data_join.loc[data_join["draught-diff"] == 0, "draught_delta"] = 0
data_join.draught_delta.value_counts() / len(data_join)
0 0.848649
1 0.151351
Name: draught_delta, dtype: float64
data_join.draught_delta.value_counts()
0 157
1 28
Name: draught_delta, dtype: int64
data_join.loc[:, "draft_ais_max"] = data_join[["draught-in", "draught-out"]].max(axis=1)
# data_join[data_join.dwt.isna()].iloc[0]
# data_join[data_join.draft_ais_max==0]
train = data_join[(data_join.vessel_info == 1) & (~data_join.dwt.isna())].copy()
df_missing = data_join[data_join.dwt.isna()].copy()
len(train) + len(df_missing) == len(data_join)
True
(
train.length_vessel.isna().sum(),
train.width_vessel.isna().sum(),
train.draught_design.isna().sum(),
train.dwt.isna().sum(),
)
(0, 0, 0, 0)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
y = list(train.dwt.astype("int"))
X = [
[row.length_vessel, row.width_vessel, row.draught_design]
for idx, row in train.iterrows()
]
len(y) == len(X)
True
# Spliting arrays or matrices into random train and test subsets
# i.e. 70 % training dataset and 25 % test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# creating a RF classifier
clf = RandomForestClassifier(n_estimators=500, random_state=1)
# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)
# performing predictions on the test dataset
y_pred = clf.predict(X_test)
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
print("R-Squared OF THE MODEL: ", metrics.r2_score(y_test, y_pred))
ACCURACY OF THE MODEL: 0.7
R-Squared OF THE MODEL: 0.9772705948715836
# df_missing.loc[df_missing.length_ais==0]
X_missing = [
[row.length_ais, row.width_ais, row.draft_ais_max]
for idx, row in df_missing.iterrows()
]
pred_with_missing = clf.predict(X_missing)
df_missing.dwt = pred_with_missing
data_join.loc[data_join.dwt.isna(), "dwt"] = df_missing.dwt
# data_join = data_join.loc[data_join.draught_delta==1].copy()
# data_join = data_join.loc[data_join.vessel_info==1].copy()
# data_join['draught-diff'].describe()
df = data_join.copy()
Add block coefficient category
df.columns
Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
'heading-diff', 'passing', 'port-name', 'country', 'passing2',
'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
'draft_ais_max'],
dtype='object')
df.loc[:, "block_cat"] = ""
df.loc[:, "vessel_type_best"] = df.sub_type
df.vessel_type_best.isna().sum()
80
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
df.vessel_type_best.isna(), "vessel_type_vessel"
]
df.vessel_type_best.isna().sum()
28
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
df.vessel_type_best.isna(), "vessel_type_sub"
]
df.vessel_type_best.isna().sum()
28
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
df.vessel_type_best.isna(), "vessel_type_main"
]
df.vessel_type_best.isna().sum()
28
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
df.vessel_type_best.isna(), "vessel_type_ais"
]
df.vessel_type_best.isna().sum()
0
df.loc[df.vessel_type_best.str.lower().str.contains("container"), "block_cat"] = (
"container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("bulk"), "block_cat"] = "bulk"
df.loc[df.vessel_type_best.str.lower().str.contains("tanker"), "block_cat"] = "tanker"
df.loc[df.vessel_type_best.str.lower().str.contains("lng"), "block_cat"] = "LNG"
df.loc[df.block_cat == "", "vessel_type_best"].unique()
array([], dtype=object)
# df.loc[df.block_cat=='bulk', "sub_vessel_type"].unique()
# df.loc[df.block_cat=='container', "sub_vessel_type"].unique()
# df.loc[df.vessel_type_best=='Other', ]
df.loc[df.vessel_type_best.str.lower().str.contains("cargo"), "block_cat"] = "container"
df.loc[df.vessel_type_best.str.lower().str.contains("roro"), "block_cat"] = "container"
df.loc[df.vessel_type_best.str.lower().str.contains("vehicle"), "block_cat"] = (
"container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("forest"), "block_cat"] = "bulk"
df.loc[df.vessel_type_best.str.lower().str.contains("livestock"), "block_cat"] = (
"container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("reefer"), "block_cat"] = (
"container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("other"), "block_cat"] = "container"
df.block_cat.unique()
array(['container', 'tanker', 'bulk'], dtype=object)
Get draft differences from next
aws_bucket = "wbgdecinternal-ntl"
path = "Andres_Temp/AIS"
client = boto3.client("s3")
file_list = client.list_objects_v2(Bucket=aws_bucket, Prefix=path, MaxKeys=5000)
bucket_files = [
os.path.join("s3://", aws_bucket, content["Key"])
for content in file_list["Contents"]
]
# bucket_files
df_next = pd.read_csv(
"s3://wbgdecinternal-ntl/Andres_Temp/AIS/Tartus_next_draught.csv", index_col=0
)
len(df_next)
67
df_orig = pd.read_csv(join(ais_dir, "port_calls_tartus.csv"), index_col=0)
df_orig[["turn_around_time", "date-leave"]].head(2)
turn_around_time | date-leave | |
---|---|---|
0 | 21.360278 | 2018-12-20 |
1 | 10.200833 | 2019-01-15 |
df_next = df_next.join(df_orig[["turn_around_time", "date-leave"]])
df = df.merge(
df_next[["mmsi", "draught", "heading", "date-leave", "turn_around_time"]],
on=["mmsi", "date-leave", "turn_around_time"],
how="left",
).copy()
df.loc[(df.draught_delta == 0) & (~df.draught.isna()), "draught-out"] = df.loc[
(df.draught_delta == 0) & (~df.draught.isna()), "draught"
]
df.loc[(df.draught_delta == 0) & (~df.draught.isna()), "heading-out"] = df.loc[
(df.draught_delta == 0) & (~df.draught.isna()), "heading"
]
df["draught-diff"] = df["draught-out"] - df["draught-in"]
df.loc[df["draught-diff"] != 0, "draught_delta"] = 1
df.draught_delta.value_counts()
0 111
1 74
Name: draught_delta, dtype: int64
df.loc[df["draught-in"] == 0, "draught_delta"] = 0
df.loc[df["draught-out"] == 0, "draught_delta"] = 0
df.loc[df["draught-in"] == 0, "draught-diff"] = 0
df.loc[df["draught-out"] == 0, "draught-diff"] = 0
df.loc[:, "draft_max"] = df[["draught_design", "draught-in", "draught-out"]].max(axis=1)
df.loc[:, "length"] = df.loc[:, "length_vessel"]
df.loc[df.length.isna(), "length"] = df.loc[df.length.isna(), "length_ais"]
df.loc[:, "width"] = df.loc[:, "width_vessel"]
df.loc[df.width.isna(), "width"] = df.loc[df.width.isna(), "width_ais"]
def calculate_payload(row, direction):
# get parameters
Din = row["draught-in"]
Dout = row["draught-out"]
if Din == Dout:
return pd.NA
else:
L = row["length"]
W = row["width"]
Dd = row["draft_max"]
DWT = row["dwt"]
Cb = block_coefficients[row["block_cat"]]
pw = 1.025 # tons/m3 1029kg/m3
if direction == "in":
Dr = Din
if direction == "out":
Dr = Dout
# calcualte block coefficient for reported draft
Cbr = 1 - ((1 - Cb) * ((Dr / Dd) ** (1 / 3)))
# calcualte payload rate
payload = ((((Cbr * Dr) - (Cb * Dd)) * (L * W * pw)) + DWT) / DWT
return payload
df.loc[:, "payload_in"] = df.apply(lambda x: calculate_payload(x, "in"), axis=1)
df.loc[:, "payload_out"] = df.apply(lambda x: calculate_payload(x, "out"), axis=1)
# df.loc[:, ['payload_in', 'payload_out']]
df.loc[:, "export_val"] = 0
df.loc[:, "import_val"] = 0
for idx, row in df.iterrows():
delta = row["draught-diff"]
if delta < 0:
import_val = (row.payload_in * row.dwt) - (row.payload_out * row.dwt)
export_val = 0
elif delta > 0:
import_val = 0
export_val = (row.payload_out * row.dwt) - (row.payload_in * row.dwt)
df.loc[idx, "import_val"] = import_val
df.loc[idx, "export_val"] = export_val
# trade_in = df.loc[df['draught-diff']<0, "import_val"].sum()
# trade_out = df.loc[df['draught-diff']>0, "export_val"].sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
df_sub = df.loc[df.draught_delta == 1].copy()
avg_flow = df_sub[["mmsi", "import_val", "export_val"]].groupby("mmsi").mean()
avg_flow.loc[avg_flow.import_val > avg_flow.export_val, "import_avg"] = (
avg_flow.import_val
)
avg_flow.loc[avg_flow.import_val < avg_flow.export_val, "export_avg"] = (
avg_flow.export_val
)
avg_flow.fillna(0, inplace=True)
# avg_flow.reset_index(inplace=True)
avg_flow.drop(["import_val", "export_val"], axis=1, inplace=True)
len(avg_flow)
26
for idx, row in df.iterrows():
delta = row["draught-diff"]
if delta == 0:
try:
avg = avg_flow.loc[row.mmsi]
df.loc[idx, "import_val"] = avg.import_avg # import_val
df.loc[idx, "export_val"] = avg.export_avg # export_val
except:
df.loc[idx, "import_val"] = 0
df.loc[idx, "export_val"] = 0
len(df.loc[df.draught_delta == 0])
115
len(df.loc[(df.draught_delta == 0) & (df.import_val > 0)])
22
len(df.loc[(df.draught_delta == 0) & (df.export_val > 0)])
3
# trade_in = (df.payload_in*df.dwt).sum()
# trade_out = (df.payload_out*df.dwt).sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
# df.loc[:, "export_val"] = 0
# df.loc[:, "import_val"] = 0
# for idx, row in df.iterrows():
# delta = row['draught-diff']
# if delta < 0:
# import_val = (row.payload_in*row.dwt) - (row.payload_out*row.dwt)
# export_val = 0
# elif delta > 0:
# import_val = 0
# export_val = (row.payload_out*row.dwt) - (row.payload_in*row.dwt)
# elif delta == 0:
# if fr_im>fr_exp:
# fr_import = fr_im-fr_exp
# fr_export = 0
# elif fr_im<fr_exp:
# fr_export = fr_exp-fr_im
# fr_import = 0
# import_val = fr_import*row.dwt
# export_val = fr_export*row.dwt
# df.loc[idx, 'import_val'] = import_val
# df.loc[idx, 'export_val'] = export_val
df_tartus_trade = df.copy()
Asseble both
df_final = pd.concat([df_latakia_trade, df_tartus_trade])
data_dir
'/home/wb514197/data/AIS/Syria'
df_final["date"] = pd.to_datetime(df_final["date-entry"])
df_final["ym"] = df_final["date"].dt.strftime("%Y-%m")
len(df_final)
810
len(df_final.loc[(df_final.export_val == 0) & (df_final.import_val == 0)])
143
df_final = df_final.loc[
~((df_final.export_val == 0) & (df_final.import_val == 0))
].copy()
df_final = (
df_final[["port-name", "date-entry", "ym", "export_val", "import_val"]]
.groupby(["port-name", "date-entry"])
.sum()
)
df_final.reset_index(inplace=True)
df_final["date-entry"] = pd.to_datetime(df_final["date-entry"])
df_final["ym"] = df_final["date-entry"].dt.strftime("%Y-%m")
df_final.to_csv(join(ais_dir, "final_results_12.13.csv"))