AIS Trade Estimation#

0.1 Load libraries

%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
import os
from os.path import join
from glob import glob
import pandas as pd

import numpy as np
from port_call import create_port_calls
import boto3

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
pd.options.display.max_columns = None

0.2 Load AIS data

AIS data has been accessed through the UN Big Data Platform.

We have retrieved all AIS messages that intersect a 20 km. buffer from each port in Syria (Al Ladhiqiyah, Tartus, Baniyas), available from December 1st 2018 to August 31st 2022.

ais_dir = join(os.path.expanduser("~"), "data", "AIS")
data_dir = join(ais_dir, "Syria")
data_files = glob(data_dir + "/*.csv")
dfs = [pd.read_csv(f, index_col=0) for f in data_files]
df = pd.concat(dfs)
df.polygon_name.unique()
array(['AL LADHIQIYAH', 'TARTUS', 'BANIYAS'], dtype=object)
df_latakia = df.loc[df.polygon_name == "AL LADHIQIYAH"].copy()
df_tartus = df.loc[df.polygon_name == "TARTUS"].copy()
df_baniyas = df.loc[df.polygon_name == "BANIYAS"].copy()
start = "2018-12-01"
end = "2022-08-31"
country = "Syria"

1.2 Data Preparation#

Run port call algorithm that converts AIS messages into trips.

The port call algorithm sorts AIS data by MMSI (unique identified per ship) and date/time of AIS message. It then captures days with consecutive AIS messages and groups them into a single trip, calculating new attributes for each trip: turnaround-time (total time spent at port between arrival and departure), difference in reported draft, and difference in direction of travel.

trips = create_port_calls(df_latakia, start, end, "Latakia", "Syria")
len(trips)
793

Filter out trips that are not related to trade

  • Only cargo and tanker vessels

  • Turnaround time < 5 hours or > 95th percentile (refueling or maintenance)

  • Turnaround time < 10 hours and direction within 45 degrees (passing by)

  • Vessel types or subtypes that do not relate to trade

trips.loc[trips.turn_around_time < 5, "passing2"] = "refueling"
trips.loc[
    trips.turn_around_time > np.percentile(trips.turn_around_time, 95), "passing2"
] = "maintenance"
trips.loc[
    (trips.turn_around_time < 10) & (trips["heading-diff"].abs() < 45), "passing2"
] = "passing by"
trips.passing2.value_counts()
passing by     98
maintenance    40
refueling      20
Name: passing2, dtype: int64
data = trips.loc[trips.passing2.isna()].copy()
len(data)
635
data.vessel_type.unique()
array(['Cargo', 'UNAVAILABLE', 'Passenger', 'Unknown', 'Reserved'],
      dtype=object)
data.vessel_type_main.unique()
array(['Container Ship', 'General Cargo Ship', nan,
       'Oil And Chemical Tanker', 'Bulk Carrier', 'Ro Ro Cargo Ship',
       'Specialized Cargo Ship', 'Offshore Vessel'], dtype=object)
data.loc[data.vessel_type == "Reserved", "vessel_type_main"].unique()
array([nan], dtype=object)
data.loc[data.vessel_type == "Passenger", "vessel_type_main"].unique()
array(['Ro Ro Cargo Ship', nan], dtype=object)
drop_types = ["Reserved", "Passenger"]
data = data.loc[~(data.vessel_type.isin(drop_types))].copy()
len(data)
627
data.vessel_type_sub.unique()
array([nan, 'Oil Products Tanker', 'Vehicles Carrier',
       'Livestock Carrier', 'Offshore Tug Supply Ship'], dtype=object)
drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
data = data.loc[~(data.vessel_type_sub.isin(drop_sub_types))].copy()
len(data)
625

1.3 Predict DWT#

Match vessel information from Fleetmon database, and impute missing values

data.loc[:, "mmsi"] = data.loc[:, "mmsi"].astype("int")
vessels = pd.read_csv(join(ais_dir, "vessel_database_full.csv"))
# vessels = pd.read_excel(join(ais_dir, "Pacific_vessel_database_2021_new.xlsx"), 0)
vessels.head(2)
mmsi dwt length_design width_design draught_design block vessel_type_AIS sub_type sub_vessel_type_AIS
0 667001408 82.0 34.5 11.008 2.1 0.67 Cargo Container Ship Container
1 351863000 90.0 30.5 9.919 3.5 0.67 Cargo Container Ship Container
vessels.rename(
    columns={
        "length_design": "length",
        "width_design": "width",
        "vessel_type_AIS": "vessel_type",
        "sub_vessel_type_AIS": "sub_vessel_type",
    },
    inplace=True,
)
vessels = vessels.loc[~(vessels.mmsi.isna())].copy()
vessels.loc[:, "mmsi"] = vessels.loc[:, "mmsi"].astype("int")
vessels_filt = vessels[
    ["sub_type", "vessel_type", "mmsi", "dwt", "length", "width", "draught_design"]
].copy()
# vessels_filt = vessels[['sub_vessel_type', 'vessel_type', 'mmsi', 'dwt', 'gt', 'length', 'width', 'Draft', 'flag']].copy()
vessels_filt.loc[:, "vessel_info"] = 1
data_join = data.merge(
    vessels_filt, on="mmsi", how="left", suffixes=["_ais", "_vessel"]
)
data_join.loc[data_join.vessel_info.isna(), "vessel_info"] = 0
data_join.vessel_info.value_counts()
1.0    619
0.0      6
Name: vessel_info, dtype: int64

Prepare attributes for later use

block_coefficients = {"bulk": 0.79, "container": 0.73, "tanker": 0.83, "LNG": 0.79}
# groups = {
#     'Dry Bulk':['Bulk carrier'],
#     'Container':['Container ship'],
#     'General Cargo':['General cargo vessel', 'Cargo ship'],
#     'Reefer':['Reefer'],
#     'Vehicle carrier':['Vehicle carrier'],
#     'Ro Ro Cargo Ship':['RoRo ship'],
#     'Animal products':['livestock carrier'],
#     'Forest':['Forest-producer carrier']
# }
# data_join.to_csv(join(ais_dir, "port_calls_latakia.csv"))
# data_join = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)
data_join.loc[:, "draught_delta"] = 1
data_join.loc[data_join["draught-diff"] == 0, "draught_delta"] = 0
data_join.draught_delta.value_counts() / len(data_join)
0    0.6192
1    0.3808
Name: draught_delta, dtype: float64

Only 38% have some draught delta

data_join.loc[:, "draft_ais_max"] = data_join[["draught-in", "draught-out"]].max(axis=1)
train = data_join[(data_join.vessel_info == 1) & (~data_join.dwt.isna())].copy()
df_missing = data_join[data_join.dwt.isna()].copy()
len(train) + len(df_missing) == len(data_join)
True
(
    train.length_vessel.isna().sum(),
    train.width_vessel.isna().sum(),
    train.draught_design.isna().sum(),
    train.dwt.isna().sum(),
)
(0, 0, 0, 0)
train.dwt.astype("int")
0      13623
1      20624
2      17728
3      35181
4      13623
       ...  
620    31760
621    31760
622    35220
623    31760
624     9389
Name: dwt, Length: 619, dtype: int64
y = list(train.dwt.astype("int"))
X = [
    [row.length_vessel, row.width_vessel, row.draught_design]
    for idx, row in train.iterrows()
]
len(y) == len(X)
True
# Spliting arrays or matrices into random train and test subsets
# i.e. 70 % training dataset and 25 % test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# creating a RF classifier
clf = RandomForestClassifier(n_estimators=500, random_state=1)

# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)

# performing predictions on the test dataset
y_pred = clf.predict(X_test)

# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
print("R-Squared OF THE MODEL: ", metrics.r2_score(y_test, y_pred))
ACCURACY OF THE MODEL:  0.7806451612903226
R-Squared OF THE MODEL:  0.9944090737438522
X_missing = [
    [row.length_ais, row.width_ais, row.draft_ais_max]
    for idx, row in df_missing.iterrows()
]
pred_with_missing = clf.predict(X_missing)
df_missing.dwt = pred_with_missing
data_join.loc[data_join.dwt.isna(), "dwt"] = df_missing.dwt
df = data_join.copy()

Add block coefficient category

df.columns
Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
       'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
       'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
       'heading-diff', 'passing', 'port-name', 'country', 'passing2',
       'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
       'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
       'draft_ais_max'],
      dtype='object')
df.loc[:, "block_cat"] = ""
df.loc[:, "vessel_type_best"] = df.sub_type  # df.sub_vessel_type
df.vessel_type_best.isna().sum()
177
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_vessel"
]  # vessel_type_vessel
df.vessel_type_best.isna().sum()
6
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_sub"
]
df.vessel_type_best.isna().sum()
6
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_main"
]
df.vessel_type_best.isna().sum()
5
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_ais"
]
df.vessel_type_best.isna().sum()
0
df.loc[df.vessel_type_best.str.lower().str.contains("container"), "block_cat"] = (
    "container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("bulk"), "block_cat"] = "bulk"
df.loc[df.vessel_type_best.str.lower().str.contains("tanker"), "block_cat"] = "tanker"
df.loc[df.vessel_type_best.str.lower().str.contains("lng"), "block_cat"] = "LNG"
df.loc[df.block_cat == "", "vessel_type_best"].unique()
array(['General Cargo Ship', 'Cargo', 'Vehicles Carrier',
       'Livestock Carrier'], dtype=object)
df.loc[df.vessel_type_best.str.lower().str.contains("cargo"), "block_cat"] = "container"
df.loc[df.vessel_type_best.str.lower().str.contains("roro"), "block_cat"] = "container"
df.loc[df.vessel_type_best.str.lower().str.contains("vehicle"), "block_cat"] = (
    "container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("forest"), "block_cat"] = "bulk"
df.loc[df.vessel_type_best.str.lower().str.contains("livestock"), "block_cat"] = (
    "container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("reefer"), "block_cat"] = (
    "container"
)
df.block_cat.unique()
array(['container', 'tanker', 'bulk'], dtype=object)

Get draft differences from next

aws_bucket = "wbgdecinternal-ntl"
path = "Andres_Temp/AIS"
client = boto3.client("s3")
file_list = client.list_objects_v2(Bucket=aws_bucket, Prefix=path, MaxKeys=5000)
bucket_files = [
    os.path.join("s3://", aws_bucket, content["Key"])
    for content in file_list["Contents"]
]
# bucket_files
df_next = pd.read_csv(
    "s3://wbgdecinternal-ntl/Andres_Temp/AIS/Latakia_next_draught.csv", index_col=0
)
len(df_next)
308
df_orig = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)
df_next = df_next.join(df_orig[["turn_around_time", "date-leave"]])
df = df.merge(
    df_next[["mmsi", "draught", "heading", "date-leave", "turn_around_time"]],
    on=["mmsi", "date-leave", "turn_around_time"],
    how="left",
).copy()
df.loc[(df.draught_delta == 0) & (~df.draught.isna()), "draught-out"] = df.loc[
    (df.draught_delta == 0) & (~df.draught.isna()), "draught"
]
df.loc[(df.draught_delta == 0) & (~df.draught.isna()), "heading-out"] = df.loc[
    (df.draught_delta == 0) & (~df.draught.isna()), "heading"
]
df["draught-diff"] = df["draught-out"] - df["draught-in"]
df.loc[df["draught-diff"] != 0, "draught_delta"] = 1
df.draught_delta.value_counts()
1    482
0    143
Name: draught_delta, dtype: int64

Calculate payload

df.loc[df["draught-in"] == 0, "draught_delta"] = 0
df.loc[df["draught-out"] == 0, "draught_delta"] = 0
df.loc[df["draught-in"] == 0, "draught-diff"] = 0
df.loc[df["draught-out"] == 0, "draught-diff"] = 0
df.loc[:, "draft_max"] = df[["draught_design", "draught-in", "draught-out"]].max(axis=1)
df.columns
Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
       'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
       'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
       'heading-diff', 'passing', 'port-name', 'country', 'passing2',
       'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
       'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
       'draft_ais_max', 'block_cat', 'vessel_type_best', 'draught', 'heading',
       'draft_max', 'length', 'width', 'payload_in', 'payload_out'],
      dtype='object')
df.loc[:, "length"] = df.loc[:, "length_vessel"]
df.loc[df.length.isna(), "length"] = df.loc[df.length.isna(), "length_ais"]
df.loc[:, "width"] = df.loc[:, "width_vessel"]
df.loc[df.width.isna(), "width"] = df.loc[df.width.isna(), "width_ais"]
def calculate_payload(row, direction):
    # get parameters
    Din = row["draught-in"]
    Dout = row["draught-out"]
    if Din == Dout:
        return pd.NA
    else:
        L = row["length"]
        W = row["width"]
        Dd = row["draft_max"]
        DWT = row["dwt"]
        Cb = block_coefficients[row["block_cat"]]
        pw = 1.025  # tons/m3  1029kg/m3

        if direction == "in":
            Dr = Din
        if direction == "out":
            Dr = Dout

        # calcualte block coefficient for reported draft
        Cbr = 1 - ((1 - Cb) * ((Dr / Dd) ** (1 / 3)))

        # calcualte payload rate
        payload = ((((Cbr * Dr) - (Cb * Dd)) * (L * W * pw)) + DWT) / DWT
        return payload
df.loc[:, "payload_in"] = df.apply(lambda x: calculate_payload(x, "in"), axis=1)
df.loc[:, "payload_out"] = df.apply(lambda x: calculate_payload(x, "out"), axis=1)
df.head(2)
date-leave mmsi turn_around_time datetime-leave draught-out heading-out seconds datetime-entry date-entry vessel_type_ais dtg length_ais width_ais draught-in heading-in vessel_type_main vessel_type_sub draught-diff heading-diff passing port-name country passing2 sub_type vessel_type_vessel dwt length_vessel width_vessel draught_design vessel_info draught_delta draft_ais_max block_cat vessel_type_best draught heading draft_max length width payload_in payload_out
0 2019-01-07 271044398 31.154722 2019-01-07 15:06:36 6.4 0.0 112157.0 2019-01-06 07:57:19 2019-01-06 Cargo 2019-01-06 07:39:16 152.0 24.0 8.9 0.0 Container Ship NaN -2.5 0.0 3 Latakia Syria NaN Container Ship Cargo 13623.000000 151.0 24.0 8.25 1.0 1 8.9 container Container Ship 6.4 0.0 8.9 151.0 24.0 1.0 0.551421
1 2019-01-10 271040403 10.353056 2019-01-10 19:54:11 9.3 48.0 37271.0 2019-01-10 09:33:00 2019-01-10 Cargo 2019-01-10 09:33:00 184.0 25.0 9.3 48.0 Container Ship NaN 0.0 0.0 3 Latakia Syria NaN Container Ship Cargo 20624.133195 184.0 25.0 9.40 1.0 0 9.3 container Container Ship NaN NaN 9.4 184.0 25.0 <NA> <NA>
# trade_in = (df.payload_in*df.dwt).sum()
# trade_out = (df.payload_out*df.dwt).sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
df.loc[:, "export_val"] = 0
df.loc[:, "import_val"] = 0
for idx, row in df.iterrows():
    delta = row["draught-diff"]
    if delta < 0:
        import_val = (row.payload_in * row.dwt) - (row.payload_out * row.dwt)
        export_val = 0
    elif delta > 0:
        import_val = 0
        export_val = (row.payload_out * row.dwt) - (row.payload_in * row.dwt)
    df.loc[idx, "import_val"] = import_val
    df.loc[idx, "export_val"] = export_val
df_sub = df.loc[df.draught_delta == 1].copy()
avg_flow = df_sub[["mmsi", "import_val", "export_val"]].groupby("mmsi").mean()
avg_flow.loc[avg_flow.import_val > avg_flow.export_val, "import_avg"] = (
    avg_flow.import_val
)
avg_flow.loc[avg_flow.import_val < avg_flow.export_val, "export_avg"] = (
    avg_flow.export_val
)
avg_flow.fillna(0, inplace=True)
# avg_flow.reset_index(inplace=True)
avg_flow.drop(["import_val", "export_val"], axis=1, inplace=True)
# trade_in = df.loc[df['draught-diff']<0, "import_val"].sum()
# trade_out = df.loc[df['draught-diff']>0, "export_val"].sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
for idx, row in df.iterrows():
    delta = row["draught-diff"]
    if delta == 0:
        try:
            avg = avg_flow.loc[row.mmsi]
            df.loc[idx, "import_val"] = avg.import_avg  # import_val
            df.loc[idx, "export_val"] = avg.export_avg  # export_val
        except:
            df.loc[idx, "import_val"] = 0
            df.loc[idx, "export_val"] = 0
#         import_val = fr_im*row.dwt
#         export_val = fr_exp*row.dwt
len(df.loc[df.draught_delta == 0])
149
len(df.loc[(df.draught_delta == 0) & (df.import_val > 0)])
90
len(df.loc[(df.draught_delta == 0) & (df.export_val > 0)])
6
df_latakia_trade = df.copy()
# df_final = df_latakia_trade.copy()
# df_final['date-entry']
# df_final.loc[:, "ym"] =
# df_final[['port-name', 'date-entry', 'export_val', 'import_val']].groupby(['port-name', 'date-entry']).sum().to_csv(join(ais_dir, "preliminary_results_latakia_11.22.v3.csv"))

Run Tartus#

1.2 Data Prep#

Run port call algorithm that converts AIS messages into trips

trips = create_port_calls(df_tartus, start, end, "Tartus", "Syria")
len(trips)
307

Filter out non-trade trips

trips.loc[trips.turn_around_time < 5, "passing2"] = "refueling"
trips.loc[
    trips.turn_around_time > np.percentile(trips.turn_around_time, 95), "passing2"
] = "maintenance"
trips.loc[
    (trips.turn_around_time < 10) & (trips["heading-diff"].abs() < 45), "passing2"
] = "passing by"
trips.passing2.value_counts()
passing by     86
maintenance    16
refueling      11
Name: passing2, dtype: int64
data = trips.loc[trips.passing2.isna()].copy()
len(data)
194
data.vessel_type.unique()
array(['Cargo', 'Tanker', 'Other', 'Passenger', 'SAR', 'Tug',
       'UNAVAILABLE', 'Unknown', 'Towing'], dtype=object)
data.vessel_type_main.unique()
array(['Container Ship', nan, 'General Cargo Ship', 'Bulk Carrier',
       'Other Tanker', 'Specialized Cargo Ship', 'Ro Ro Cargo Ship',
       'Oil And Chemical Tanker'], dtype=object)
drop_types = ["Reserved", "Passenger", "Passenger", "Tug", "Towing", "SAR"]
data = data.loc[~(data.vessel_type.isin(drop_types))].copy()
len(data)
185
data.vessel_type_sub.unique()
array([nan, 'Vegetable Oil Tanker', 'Livestock Carrier',
       'Chemical Oil Products Tanker', 'Chemical Tanker'], dtype=object)
# drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
# data = data.loc[~(data.vessel_type_sub.isin(drop_sub_types))].copy()
# data.loc[664]

1.3 Predict DWT#

Match vessel information from Fleetmon database, and impute missing values

data.loc[:, "mmsi"] = data.loc[:, "mmsi"].astype("int")
vessels = pd.read_csv(join(ais_dir, "vessel_database_full.csv"))
# vessels = pd.read_excel(join(ais_dir, "Pacific_vessel_database_2021_new.xlsx"), 0)
vessels.head(2)
mmsi dwt length_design width_design draught_design block vessel_type_AIS sub_type sub_vessel_type_AIS
0 667001408 82.0 34.5 11.008 2.1 0.67 Cargo Container Ship Container
1 351863000 90.0 30.5 9.919 3.5 0.67 Cargo Container Ship Container
vessels.rename(
    columns={
        "length_design": "length",
        "width_design": "width",
        "vessel_type_AIS": "vessel_type",
        "sub_vessel_type_AIS": "sub_vessel_type",
    },
    inplace=True,
)
vessels = vessels.loc[~(vessels.mmsi.isna())].copy()
vessels.loc[:, "mmsi"] = vessels.loc[:, "mmsi"].astype("int")
vessels_filt = vessels[
    ["sub_type", "vessel_type", "mmsi", "dwt", "length", "width", "draught_design"]
].copy()
vessels_filt.loc[:, "vessel_info"] = 1
data_join = data.merge(
    vessels_filt, on="mmsi", how="left", suffixes=["_ais", "_vessel"]
)
data_join.loc[data_join.vessel_info.isna(), "vessel_info"] = 0
data_join.vessel_info.value_counts()
1.0    157
0.0     28
Name: vessel_info, dtype: int64
data_join.vessel_type_vessel.unique()
array(['Cargo', 'Tanker', nan], dtype=object)

For later use

block_coefficients = {"bulk": 0.79, "container": 0.73, "tanker": 0.83, "LNG": 0.79}
groups = {
    "Dry Bulk": ["Bulk carrier"],
    "Container": ["Container ship"],
    "General Cargo": ["General cargo vessel", "Cargo ship"],
    "Reefer": ["Reefer"],
    "Vehicle carrier": ["Vehicle carrier"],
    "Ro Ro Cargo Ship": ["RoRo ship"],
    "Animal products": ["livestock carrier"],
    "Forest": ["Forest-producer carrier"],
}
# drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
# data_join = data_join.loc[~(data_join.vessel_type_sub.isin(drop_sub_types))].copy()
# data_join.to_csv(join(ais_dir, "port_calls_latakia.csv"))
# data_join = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)
data_join.loc[:, "draught_delta"] = 1
data_join.loc[data_join["draught-diff"] == 0, "draught_delta"] = 0
data_join.draught_delta.value_counts() / len(data_join)
0    0.848649
1    0.151351
Name: draught_delta, dtype: float64
data_join.draught_delta.value_counts()
0    157
1     28
Name: draught_delta, dtype: int64
data_join.loc[:, "draft_ais_max"] = data_join[["draught-in", "draught-out"]].max(axis=1)
# data_join[data_join.dwt.isna()].iloc[0]
# data_join[data_join.draft_ais_max==0]
train = data_join[(data_join.vessel_info == 1) & (~data_join.dwt.isna())].copy()
df_missing = data_join[data_join.dwt.isna()].copy()
len(train) + len(df_missing) == len(data_join)
True
(
    train.length_vessel.isna().sum(),
    train.width_vessel.isna().sum(),
    train.draught_design.isna().sum(),
    train.dwt.isna().sum(),
)
(0, 0, 0, 0)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
y = list(train.dwt.astype("int"))
X = [
    [row.length_vessel, row.width_vessel, row.draught_design]
    for idx, row in train.iterrows()
]
len(y) == len(X)
True
# Spliting arrays or matrices into random train and test subsets
# i.e. 70 % training dataset and 25 % test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# creating a RF classifier
clf = RandomForestClassifier(n_estimators=500, random_state=1)

# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)

# performing predictions on the test dataset
y_pred = clf.predict(X_test)

# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
print("R-Squared OF THE MODEL: ", metrics.r2_score(y_test, y_pred))
ACCURACY OF THE MODEL:  0.7
R-Squared OF THE MODEL:  0.9772705948715836
# df_missing.loc[df_missing.length_ais==0]
X_missing = [
    [row.length_ais, row.width_ais, row.draft_ais_max]
    for idx, row in df_missing.iterrows()
]
pred_with_missing = clf.predict(X_missing)
df_missing.dwt = pred_with_missing
data_join.loc[data_join.dwt.isna(), "dwt"] = df_missing.dwt
# data_join = data_join.loc[data_join.draught_delta==1].copy()
# data_join = data_join.loc[data_join.vessel_info==1].copy()
# data_join['draught-diff'].describe()
df = data_join.copy()

Add block coefficient category

df.columns
Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
       'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
       'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
       'heading-diff', 'passing', 'port-name', 'country', 'passing2',
       'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
       'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
       'draft_ais_max'],
      dtype='object')
df.loc[:, "block_cat"] = ""
df.loc[:, "vessel_type_best"] = df.sub_type
df.vessel_type_best.isna().sum()
80
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_vessel"
]
df.vessel_type_best.isna().sum()
28
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_sub"
]
df.vessel_type_best.isna().sum()
28
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_main"
]
df.vessel_type_best.isna().sum()
28
df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_ais"
]
df.vessel_type_best.isna().sum()
0
df.loc[df.vessel_type_best.str.lower().str.contains("container"), "block_cat"] = (
    "container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("bulk"), "block_cat"] = "bulk"
df.loc[df.vessel_type_best.str.lower().str.contains("tanker"), "block_cat"] = "tanker"
df.loc[df.vessel_type_best.str.lower().str.contains("lng"), "block_cat"] = "LNG"
df.loc[df.block_cat == "", "vessel_type_best"].unique()
array([], dtype=object)
# df.loc[df.block_cat=='bulk', "sub_vessel_type"].unique()
# df.loc[df.block_cat=='container', "sub_vessel_type"].unique()
# df.loc[df.vessel_type_best=='Other', ]
df.loc[df.vessel_type_best.str.lower().str.contains("cargo"), "block_cat"] = "container"
df.loc[df.vessel_type_best.str.lower().str.contains("roro"), "block_cat"] = "container"
df.loc[df.vessel_type_best.str.lower().str.contains("vehicle"), "block_cat"] = (
    "container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("forest"), "block_cat"] = "bulk"
df.loc[df.vessel_type_best.str.lower().str.contains("livestock"), "block_cat"] = (
    "container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("reefer"), "block_cat"] = (
    "container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("other"), "block_cat"] = "container"
df.block_cat.unique()
array(['container', 'tanker', 'bulk'], dtype=object)

Get draft differences from next

aws_bucket = "wbgdecinternal-ntl"
path = "Andres_Temp/AIS"
client = boto3.client("s3")
file_list = client.list_objects_v2(Bucket=aws_bucket, Prefix=path, MaxKeys=5000)
bucket_files = [
    os.path.join("s3://", aws_bucket, content["Key"])
    for content in file_list["Contents"]
]
# bucket_files
df_next = pd.read_csv(
    "s3://wbgdecinternal-ntl/Andres_Temp/AIS/Tartus_next_draught.csv", index_col=0
)
len(df_next)
67
df_orig = pd.read_csv(join(ais_dir, "port_calls_tartus.csv"), index_col=0)
df_orig[["turn_around_time", "date-leave"]].head(2)
turn_around_time date-leave
0 21.360278 2018-12-20
1 10.200833 2019-01-15
df_next = df_next.join(df_orig[["turn_around_time", "date-leave"]])
df = df.merge(
    df_next[["mmsi", "draught", "heading", "date-leave", "turn_around_time"]],
    on=["mmsi", "date-leave", "turn_around_time"],
    how="left",
).copy()
df.loc[(df.draught_delta == 0) & (~df.draught.isna()), "draught-out"] = df.loc[
    (df.draught_delta == 0) & (~df.draught.isna()), "draught"
]
df.loc[(df.draught_delta == 0) & (~df.draught.isna()), "heading-out"] = df.loc[
    (df.draught_delta == 0) & (~df.draught.isna()), "heading"
]
df["draught-diff"] = df["draught-out"] - df["draught-in"]
df.loc[df["draught-diff"] != 0, "draught_delta"] = 1
df.draught_delta.value_counts()
0    111
1     74
Name: draught_delta, dtype: int64
df.loc[df["draught-in"] == 0, "draught_delta"] = 0
df.loc[df["draught-out"] == 0, "draught_delta"] = 0
df.loc[df["draught-in"] == 0, "draught-diff"] = 0
df.loc[df["draught-out"] == 0, "draught-diff"] = 0
df.loc[:, "draft_max"] = df[["draught_design", "draught-in", "draught-out"]].max(axis=1)
df.loc[:, "length"] = df.loc[:, "length_vessel"]
df.loc[df.length.isna(), "length"] = df.loc[df.length.isna(), "length_ais"]
df.loc[:, "width"] = df.loc[:, "width_vessel"]
df.loc[df.width.isna(), "width"] = df.loc[df.width.isna(), "width_ais"]
def calculate_payload(row, direction):
    # get parameters
    Din = row["draught-in"]
    Dout = row["draught-out"]
    if Din == Dout:
        return pd.NA
    else:
        L = row["length"]
        W = row["width"]
        Dd = row["draft_max"]
        DWT = row["dwt"]
        Cb = block_coefficients[row["block_cat"]]
        pw = 1.025  # tons/m3  1029kg/m3

        if direction == "in":
            Dr = Din
        if direction == "out":
            Dr = Dout

        # calcualte block coefficient for reported draft
        Cbr = 1 - ((1 - Cb) * ((Dr / Dd) ** (1 / 3)))

        # calcualte payload rate
        payload = ((((Cbr * Dr) - (Cb * Dd)) * (L * W * pw)) + DWT) / DWT
        return payload
df.loc[:, "payload_in"] = df.apply(lambda x: calculate_payload(x, "in"), axis=1)
df.loc[:, "payload_out"] = df.apply(lambda x: calculate_payload(x, "out"), axis=1)
# df.loc[:, ['payload_in', 'payload_out']]
df.loc[:, "export_val"] = 0
df.loc[:, "import_val"] = 0
for idx, row in df.iterrows():
    delta = row["draught-diff"]
    if delta < 0:
        import_val = (row.payload_in * row.dwt) - (row.payload_out * row.dwt)
        export_val = 0
    elif delta > 0:
        import_val = 0
        export_val = (row.payload_out * row.dwt) - (row.payload_in * row.dwt)
    df.loc[idx, "import_val"] = import_val
    df.loc[idx, "export_val"] = export_val
# trade_in = df.loc[df['draught-diff']<0, "import_val"].sum()
# trade_out = df.loc[df['draught-diff']>0, "export_val"].sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
df_sub = df.loc[df.draught_delta == 1].copy()
avg_flow = df_sub[["mmsi", "import_val", "export_val"]].groupby("mmsi").mean()
avg_flow.loc[avg_flow.import_val > avg_flow.export_val, "import_avg"] = (
    avg_flow.import_val
)
avg_flow.loc[avg_flow.import_val < avg_flow.export_val, "export_avg"] = (
    avg_flow.export_val
)
avg_flow.fillna(0, inplace=True)
# avg_flow.reset_index(inplace=True)
avg_flow.drop(["import_val", "export_val"], axis=1, inplace=True)
len(avg_flow)
26
for idx, row in df.iterrows():
    delta = row["draught-diff"]
    if delta == 0:
        try:
            avg = avg_flow.loc[row.mmsi]
            df.loc[idx, "import_val"] = avg.import_avg  # import_val
            df.loc[idx, "export_val"] = avg.export_avg  # export_val
        except:
            df.loc[idx, "import_val"] = 0
            df.loc[idx, "export_val"] = 0
len(df.loc[df.draught_delta == 0])
115
len(df.loc[(df.draught_delta == 0) & (df.import_val > 0)])
22
len(df.loc[(df.draught_delta == 0) & (df.export_val > 0)])
3
# trade_in = (df.payload_in*df.dwt).sum()
# trade_out = (df.payload_out*df.dwt).sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)
# df.loc[:, "export_val"] = 0
# df.loc[:, "import_val"] = 0
# for idx, row in df.iterrows():
#     delta = row['draught-diff']
#     if delta < 0:
#         import_val = (row.payload_in*row.dwt) - (row.payload_out*row.dwt)
#         export_val = 0
#     elif delta > 0:
#         import_val = 0
#         export_val = (row.payload_out*row.dwt) - (row.payload_in*row.dwt)
#     elif delta == 0:
#         if fr_im>fr_exp:
#             fr_import = fr_im-fr_exp
#             fr_export = 0
#         elif fr_im<fr_exp:
#             fr_export = fr_exp-fr_im
#             fr_import = 0
#         import_val = fr_import*row.dwt
#         export_val = fr_export*row.dwt
#     df.loc[idx, 'import_val'] = import_val
#     df.loc[idx, 'export_val'] = export_val
df_tartus_trade = df.copy()

Asseble both

df_final = pd.concat([df_latakia_trade, df_tartus_trade])
data_dir
'/home/wb514197/data/AIS/Syria'
df_final["date"] = pd.to_datetime(df_final["date-entry"])
df_final["ym"] = df_final["date"].dt.strftime("%Y-%m")
len(df_final)
810
len(df_final.loc[(df_final.export_val == 0) & (df_final.import_val == 0)])
143
df_final = df_final.loc[
    ~((df_final.export_val == 0) & (df_final.import_val == 0))
].copy()
df_final = (
    df_final[["port-name", "date-entry", "ym", "export_val", "import_val"]]
    .groupby(["port-name", "date-entry"])
    .sum()
)
df_final.reset_index(inplace=True)
df_final["date-entry"] = pd.to_datetime(df_final["date-entry"])
df_final["ym"] = df_final["date-entry"].dt.strftime("%Y-%m")
df_final.to_csv(join(ais_dir, "final_results_12.13.csv"))