AIS Trade Estimation

AIS Trade Estimation#

This notebook implements the AIS-derived trade estimation methodology from: Global economic impacts of COVID-19 lockdown measures stand out in high-frequency shipping data, Vesrchuur et al 2021

0.1 Load libraries

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

import os
from os.path import join
from glob import glob
import pandas as pd

import numpy as np
from port_call import create_port_calls
import boto3

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

pd.options.display.max_columns = None

0.2 Load AIS data

AIS data has been accessed through the UN Big Data Platform.

We have retrieved all AIS messages that intersect a 20 km. buffer from each port in Syria (Al Ladhiqiyah, Tartus, Baniyas), available from December 1st 2018 to August 31st 2022.

ais_dir = join(os.path.expanduser("~"), "data", "AIS")
data_dir = join(ais_dir, "Syria")

data_files = glob(data_dir + "/*.csv")

dfs = [pd.read_csv(f, index_col=0) for f in data_files]

df = pd.concat(dfs)

df.polygon_name.unique()

array(['AL LADHIQIYAH', 'TARTUS', 'BANIYAS'], dtype=object)

df_latakia = df.loc[df.polygon_name == "AL LADHIQIYAH"].copy()
df_tartus = df.loc[df.polygon_name == "TARTUS"].copy()
df_baniyas = df.loc[df.polygon_name == "BANIYAS"].copy()

start = "2018-12-01"
end = "2022-08-31"
country = "Syria"

1.2 Data Preparation#

Run port call algorithm that converts AIS messages into trips.

The port call algorithm sorts AIS data by MMSI (unique identified per ship) and date/time of AIS message. It then captures days with consecutive AIS messages and groups them into a single trip, calculating new attributes for each trip: turnaround-time (total time spent at port between arrival and departure), difference in reported draft, and difference in direction of travel.

trips = create_port_calls(df_latakia, start, end, "Latakia", "Syria")

len(trips)

Filter out trips that are not related to trade

Only cargo and tanker vessels
Turnaround time < 5 hours or > 95th percentile (refueling or maintenance)
Turnaround time < 10 hours and direction within 45 degrees (passing by)
Vessel types or subtypes that do not relate to trade

trips.loc[trips.turn_around_time < 5, "passing2"] = "refueling"
trips.loc[
    trips.turn_around_time > np.percentile(trips.turn_around_time, 95), "passing2"
] = "maintenance"
trips.loc[
    (trips.turn_around_time < 10) & (trips["heading-diff"].abs() < 45), "passing2"
] = "passing by"

trips.passing2.value_counts()

passing by     98
maintenance    40
refueling      20
Name: passing2, dtype: int64

data = trips.loc[trips.passing2.isna()].copy()

len(data)

data.vessel_type.unique()

array(['Cargo', 'UNAVAILABLE', 'Passenger', 'Unknown', 'Reserved'],
      dtype=object)

data.vessel_type_main.unique()

array(['Container Ship', 'General Cargo Ship', nan,
       'Oil And Chemical Tanker', 'Bulk Carrier', 'Ro Ro Cargo Ship',
       'Specialized Cargo Ship', 'Offshore Vessel'], dtype=object)

data.loc[data.vessel_type == "Reserved", "vessel_type_main"].unique()

array([nan], dtype=object)

data.loc[data.vessel_type == "Passenger", "vessel_type_main"].unique()

array(['Ro Ro Cargo Ship', nan], dtype=object)

drop_types = ["Reserved", "Passenger"]
data = data.loc[~(data.vessel_type.isin(drop_types))].copy()

len(data)

data.vessel_type_sub.unique()

array([nan, 'Oil Products Tanker', 'Vehicles Carrier',
       'Livestock Carrier', 'Offshore Tug Supply Ship'], dtype=object)

drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
data = data.loc[~(data.vessel_type_sub.isin(drop_sub_types))].copy()

len(data)

1.3 Predict DWT#

Match vessel information from Fleetmon database, and impute missing values

data.loc[:, "mmsi"] = data.loc[:, "mmsi"].astype("int")

vessels = pd.read_csv(join(ais_dir, "vessel_database_full.csv"))

# vessels = pd.read_excel(join(ais_dir, "Pacific_vessel_database_2021_new.xlsx"), 0)

vessels.head(2)

	mmsi	dwt	length_design	width_design	draught_design	block	vessel_type_AIS	sub_type	sub_vessel_type_AIS
0	667001408	82.0	34.5	11.008	2.1	0.67	Cargo	Container Ship	Container
1	351863000	90.0	30.5	9.919	3.5	0.67	Cargo	Container Ship	Container

vessels.rename(
    columns={
        "length_design": "length",
        "width_design": "width",
        "vessel_type_AIS": "vessel_type",
        "sub_vessel_type_AIS": "sub_vessel_type",
    },
    inplace=True,
)

vessels = vessels.loc[~(vessels.mmsi.isna())].copy()

vessels.loc[:, "mmsi"] = vessels.loc[:, "mmsi"].astype("int")

vessels_filt = vessels[
    ["sub_type", "vessel_type", "mmsi", "dwt", "length", "width", "draught_design"]
].copy()

# vessels_filt = vessels[['sub_vessel_type', 'vessel_type', 'mmsi', 'dwt', 'gt', 'length', 'width', 'Draft', 'flag']].copy()

vessels_filt.loc[:, "vessel_info"] = 1

data_join = data.merge(
    vessels_filt, on="mmsi", how="left", suffixes=["_ais", "_vessel"]
)

data_join.loc[data_join.vessel_info.isna(), "vessel_info"] = 0

data_join.vessel_info.value_counts()

1.0    619
0.0      6
Name: vessel_info, dtype: int64

Prepare attributes for later use

block_coefficients = {"bulk": 0.79, "container": 0.73, "tanker": 0.83, "LNG": 0.79}
# groups = {
#     'Dry Bulk':['Bulk carrier'],
#     'Container':['Container ship'],
#     'General Cargo':['General cargo vessel', 'Cargo ship'],
#     'Reefer':['Reefer'],
#     'Vehicle carrier':['Vehicle carrier'],
#     'Ro Ro Cargo Ship':['RoRo ship'],
#     'Animal products':['livestock carrier'],
#     'Forest':['Forest-producer carrier']
# }

# data_join.to_csv(join(ais_dir, "port_calls_latakia.csv"))
# data_join = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)

data_join.loc[:, "draught_delta"] = 1

data_join.loc[data_join["draught-diff"] == 0, "draught_delta"] = 0

data_join.draught_delta.value_counts() / len(data_join)

0    0.6192
1    0.3808
Name: draught_delta, dtype: float64

Only 38% have some draught delta

data_join.loc[:, "draft_ais_max"] = data_join[["draught-in", "draught-out"]].max(axis=1)

train = data_join[(data_join.vessel_info == 1) & (~data_join.dwt.isna())].copy()

df_missing = data_join[data_join.dwt.isna()].copy()

len(train) + len(df_missing) == len(data_join)

True

(
    train.length_vessel.isna().sum(),
    train.width_vessel.isna().sum(),
    train.draught_design.isna().sum(),
    train.dwt.isna().sum(),
)

(0, 0, 0, 0)

train.dwt.astype("int")

    13623
    20624
    17728
    35181
    13623
       ...  
  31760
  31760
  35220
  31760
   9389
Name: dwt, Length: 619, dtype: int64

y = list(train.dwt.astype("int"))
X = [
    [row.length_vessel, row.width_vessel, row.draught_design]
    for idx, row in train.iterrows()
]
len(y) == len(X)

True

# Spliting arrays or matrices into random train and test subsets
# i.e. 70 % training dataset and 25 % test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# creating a RF classifier
clf = RandomForestClassifier(n_estimators=500, random_state=1)

# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)

# performing predictions on the test dataset
y_pred = clf.predict(X_test)

# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
print("R-Squared OF THE MODEL: ", metrics.r2_score(y_test, y_pred))

ACCURACY OF THE MODEL:  0.7806451612903226
R-Squared OF THE MODEL:  0.9944090737438522

X_missing = [
    [row.length_ais, row.width_ais, row.draft_ais_max]
    for idx, row in df_missing.iterrows()
]

pred_with_missing = clf.predict(X_missing)

df_missing.dwt = pred_with_missing

data_join.loc[data_join.dwt.isna(), "dwt"] = df_missing.dwt

df = data_join.copy()

Add block coefficient category

df.columns

Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
       'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
       'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
       'heading-diff', 'passing', 'port-name', 'country', 'passing2',
       'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
       'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
       'draft_ais_max'],
      dtype='object')

df.loc[:, "block_cat"] = ""

df.loc[:, "vessel_type_best"] = df.sub_type  # df.sub_vessel_type
df.vessel_type_best.isna().sum()

df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_vessel"
]  # vessel_type_vessel
df.vessel_type_best.isna().sum()

df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_sub"
]
df.vessel_type_best.isna().sum()

df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_main"
]
df.vessel_type_best.isna().sum()

df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_ais"
]
df.vessel_type_best.isna().sum()

df.loc[df.vessel_type_best.str.lower().str.contains("container"), "block_cat"] = (
    "container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("bulk"), "block_cat"] = "bulk"
df.loc[df.vessel_type_best.str.lower().str.contains("tanker"), "block_cat"] = "tanker"
df.loc[df.vessel_type_best.str.lower().str.contains("lng"), "block_cat"] = "LNG"

df.loc[df.block_cat == "", "vessel_type_best"].unique()

array(['General Cargo Ship', 'Cargo', 'Vehicles Carrier',
       'Livestock Carrier'], dtype=object)

df.loc[df.vessel_type_best.str.lower().str.contains("cargo"), "block_cat"] = "container"
df.loc[df.vessel_type_best.str.lower().str.contains("roro"), "block_cat"] = "container"
df.loc[df.vessel_type_best.str.lower().str.contains("vehicle"), "block_cat"] = (
    "container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("forest"), "block_cat"] = "bulk"
df.loc[df.vessel_type_best.str.lower().str.contains("livestock"), "block_cat"] = (
    "container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("reefer"), "block_cat"] = (
    "container"
)

df.block_cat.unique()

array(['container', 'tanker', 'bulk'], dtype=object)

Get draft differences from next

aws_bucket = "wbgdecinternal-ntl"
path = "Andres_Temp/AIS"

client = boto3.client("s3")

file_list = client.list_objects_v2(Bucket=aws_bucket, Prefix=path, MaxKeys=5000)
bucket_files = [
    os.path.join("s3://", aws_bucket, content["Key"])
    for content in file_list["Contents"]
]
# bucket_files

df_next = pd.read_csv(
    "s3://wbgdecinternal-ntl/Andres_Temp/AIS/Latakia_next_draught.csv", index_col=0
)

len(df_next)

df_orig = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)

df_next = df_next.join(df_orig[["turn_around_time", "date-leave"]])

df = df.merge(
    df_next[["mmsi", "draught", "heading", "date-leave", "turn_around_time"]],
    on=["mmsi", "date-leave", "turn_around_time"],
    how="left",
).copy()

df.loc[(df.draught_delta == 0) & (~df.draught.isna()), "draught-out"] = df.loc[
    (df.draught_delta == 0) & (~df.draught.isna()), "draught"
]
df.loc[(df.draught_delta == 0) & (~df.draught.isna()), "heading-out"] = df.loc[
    (df.draught_delta == 0) & (~df.draught.isna()), "heading"
]

df["draught-diff"] = df["draught-out"] - df["draught-in"]

df.loc[df["draught-diff"] != 0, "draught_delta"] = 1

df.draught_delta.value_counts()

1    482
0    143
Name: draught_delta, dtype: int64

Calculate payload

df.loc[df["draught-in"] == 0, "draught_delta"] = 0
df.loc[df["draught-out"] == 0, "draught_delta"] = 0
df.loc[df["draught-in"] == 0, "draught-diff"] = 0
df.loc[df["draught-out"] == 0, "draught-diff"] = 0

df.loc[:, "draft_max"] = df[["draught_design", "draught-in", "draught-out"]].max(axis=1)

df.columns

Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
       'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
       'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
       'heading-diff', 'passing', 'port-name', 'country', 'passing2',
       'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
       'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
       'draft_ais_max', 'block_cat', 'vessel_type_best', 'draught', 'heading',
       'draft_max', 'length', 'width', 'payload_in', 'payload_out'],
      dtype='object')

df.loc[:, "length"] = df.loc[:, "length_vessel"]
df.loc[df.length.isna(), "length"] = df.loc[df.length.isna(), "length_ais"]

df.loc[:, "width"] = df.loc[:, "width_vessel"]
df.loc[df.width.isna(), "width"] = df.loc[df.width.isna(), "width_ais"]

def calculate_payload(row, direction):
    # get parameters
    Din = row["draught-in"]
    Dout = row["draught-out"]
    if Din == Dout:
        return pd.NA
    else:
        L = row["length"]
        W = row["width"]
        Dd = row["draft_max"]
        DWT = row["dwt"]
        Cb = block_coefficients[row["block_cat"]]
        pw = 1.025  # tons/m3  1029kg/m3

        if direction == "in":
            Dr = Din
        if direction == "out":
            Dr = Dout

        # calcualte block coefficient for reported draft
        Cbr = 1 - ((1 - Cb) * ((Dr / Dd) ** (1 / 3)))

        # calcualte payload rate
        payload = ((((Cbr * Dr) - (Cb * Dd)) * (L * W * pw)) + DWT) / DWT
        return payload

df.loc[:, "payload_in"] = df.apply(lambda x: calculate_payload(x, "in"), axis=1)
df.loc[:, "payload_out"] = df.apply(lambda x: calculate_payload(x, "out"), axis=1)

df.head(2)

	date-leave	mmsi	turn_around_time	datetime-leave	draught-out	heading-out	seconds	datetime-entry	date-entry	vessel_type_ais	dtg	length_ais	width_ais	draught-in	heading-in	vessel_type_main	vessel_type_sub	draught-diff	heading-diff	passing	port-name	country	passing2	sub_type	vessel_type_vessel	dwt	length_vessel	width_vessel	draught_design	vessel_info	draught_delta	draft_ais_max	block_cat	vessel_type_best	draught	heading	draft_max	length	width	payload_in	payload_out
0	2019-01-07	271044398	31.154722	2019-01-07 15:06:36	6.4	0.0	112157.0	2019-01-06 07:57:19	2019-01-06	Cargo	2019-01-06 07:39:16	152.0	24.0	8.9	0.0	Container Ship	NaN	-2.5	0.0	3	Latakia	Syria	NaN	Container Ship	Cargo	13623.000000	151.0	24.0	8.25	1.0	1	8.9	container	Container Ship	6.4	0.0	8.9	151.0	24.0	1.0	0.551421
1	2019-01-10	271040403	10.353056	2019-01-10 19:54:11	9.3	48.0	37271.0	2019-01-10 09:33:00	2019-01-10	Cargo	2019-01-10 09:33:00	184.0	25.0	9.3	48.0	Container Ship	NaN	0.0	0.0	3	Latakia	Syria	NaN	Container Ship	Cargo	20624.133195	184.0	25.0	9.40	1.0	0	9.3	container	Container Ship	NaN	NaN	9.4	184.0	25.0	<NA>	<NA>

# trade_in = (df.payload_in*df.dwt).sum()
# trade_out = (df.payload_out*df.dwt).sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)

df.loc[:, "export_val"] = 0
df.loc[:, "import_val"] = 0

for idx, row in df.iterrows():
    delta = row["draught-diff"]
    if delta < 0:
        import_val = (row.payload_in * row.dwt) - (row.payload_out * row.dwt)
        export_val = 0
    elif delta > 0:
        import_val = 0
        export_val = (row.payload_out * row.dwt) - (row.payload_in * row.dwt)
    df.loc[idx, "import_val"] = import_val
    df.loc[idx, "export_val"] = export_val

df_sub = df.loc[df.draught_delta == 1].copy()

avg_flow = df_sub[["mmsi", "import_val", "export_val"]].groupby("mmsi").mean()

avg_flow.loc[avg_flow.import_val > avg_flow.export_val, "import_avg"] = (
    avg_flow.import_val
)
avg_flow.loc[avg_flow.import_val < avg_flow.export_val, "export_avg"] = (
    avg_flow.export_val
)

avg_flow.fillna(0, inplace=True)
# avg_flow.reset_index(inplace=True)
avg_flow.drop(["import_val", "export_val"], axis=1, inplace=True)

# trade_in = df.loc[df['draught-diff']<0, "import_val"].sum()
# trade_out = df.loc[df['draught-diff']>0, "export_val"].sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)

for idx, row in df.iterrows():
    delta = row["draught-diff"]
    if delta == 0:
        try:
            avg = avg_flow.loc[row.mmsi]
            df.loc[idx, "import_val"] = avg.import_avg  # import_val
            df.loc[idx, "export_val"] = avg.export_avg  # export_val
        except:
            df.loc[idx, "import_val"] = 0
            df.loc[idx, "export_val"] = 0
#         import_val = fr_im*row.dwt
#         export_val = fr_exp*row.dwt

len(df.loc[df.draught_delta == 0])

len(df.loc[(df.draught_delta == 0) & (df.import_val > 0)])

len(df.loc[(df.draught_delta == 0) & (df.export_val > 0)])

df_latakia_trade = df.copy()

# df_final = df_latakia_trade.copy()

# df_final['date-entry']

# df_final.loc[:, "ym"] =

# df_final[['port-name', 'date-entry', 'export_val', 'import_val']].groupby(['port-name', 'date-entry']).sum().to_csv(join(ais_dir, "preliminary_results_latakia_11.22.v3.csv"))

Run Tartus#

1.2 Data Prep#

Run port call algorithm that converts AIS messages into trips

trips = create_port_calls(df_tartus, start, end, "Tartus", "Syria")

len(trips)

Filter out non-trade trips

trips.loc[trips.turn_around_time < 5, "passing2"] = "refueling"
trips.loc[
    trips.turn_around_time > np.percentile(trips.turn_around_time, 95), "passing2"
] = "maintenance"
trips.loc[
    (trips.turn_around_time < 10) & (trips["heading-diff"].abs() < 45), "passing2"
] = "passing by"

trips.passing2.value_counts()

passing by     86
maintenance    16
refueling      11
Name: passing2, dtype: int64

data = trips.loc[trips.passing2.isna()].copy()

len(data)

data.vessel_type.unique()

array(['Cargo', 'Tanker', 'Other', 'Passenger', 'SAR', 'Tug',
       'UNAVAILABLE', 'Unknown', 'Towing'], dtype=object)

data.vessel_type_main.unique()

array(['Container Ship', nan, 'General Cargo Ship', 'Bulk Carrier',
       'Other Tanker', 'Specialized Cargo Ship', 'Ro Ro Cargo Ship',
       'Oil And Chemical Tanker'], dtype=object)

drop_types = ["Reserved", "Passenger", "Passenger", "Tug", "Towing", "SAR"]
data = data.loc[~(data.vessel_type.isin(drop_types))].copy()

len(data)

data.vessel_type_sub.unique()

array([nan, 'Vegetable Oil Tanker', 'Livestock Carrier',
       'Chemical Oil Products Tanker', 'Chemical Tanker'], dtype=object)

# drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
# data = data.loc[~(data.vessel_type_sub.isin(drop_sub_types))].copy()

# data.loc[664]

1.3 Predict DWT#

Match vessel information from Fleetmon database, and impute missing values

data.loc[:, "mmsi"] = data.loc[:, "mmsi"].astype("int")

vessels = pd.read_csv(join(ais_dir, "vessel_database_full.csv"))

# vessels = pd.read_excel(join(ais_dir, "Pacific_vessel_database_2021_new.xlsx"), 0)

vessels.head(2)

	mmsi	dwt	length_design	width_design	draught_design	block	vessel_type_AIS	sub_type	sub_vessel_type_AIS
0	667001408	82.0	34.5	11.008	2.1	0.67	Cargo	Container Ship	Container
1	351863000	90.0	30.5	9.919	3.5	0.67	Cargo	Container Ship	Container

vessels.rename(
    columns={
        "length_design": "length",
        "width_design": "width",
        "vessel_type_AIS": "vessel_type",
        "sub_vessel_type_AIS": "sub_vessel_type",
    },
    inplace=True,
)

vessels = vessels.loc[~(vessels.mmsi.isna())].copy()

vessels.loc[:, "mmsi"] = vessels.loc[:, "mmsi"].astype("int")

vessels_filt = vessels[
    ["sub_type", "vessel_type", "mmsi", "dwt", "length", "width", "draught_design"]
].copy()

vessels_filt.loc[:, "vessel_info"] = 1

data_join = data.merge(
    vessels_filt, on="mmsi", how="left", suffixes=["_ais", "_vessel"]
)

data_join.loc[data_join.vessel_info.isna(), "vessel_info"] = 0

data_join.vessel_info.value_counts()

1.0    157
0.0     28
Name: vessel_info, dtype: int64

data_join.vessel_type_vessel.unique()

array(['Cargo', 'Tanker', nan], dtype=object)

For later use

block_coefficients = {"bulk": 0.79, "container": 0.73, "tanker": 0.83, "LNG": 0.79}
groups = {
    "Dry Bulk": ["Bulk carrier"],
    "Container": ["Container ship"],
    "General Cargo": ["General cargo vessel", "Cargo ship"],
    "Reefer": ["Reefer"],
    "Vehicle carrier": ["Vehicle carrier"],
    "Ro Ro Cargo Ship": ["RoRo ship"],
    "Animal products": ["livestock carrier"],
    "Forest": ["Forest-producer carrier"],
}

# drop_sub_types = ["Offshore Tug Supply Ship", "Crewboat"]
# data_join = data_join.loc[~(data_join.vessel_type_sub.isin(drop_sub_types))].copy()

# data_join.to_csv(join(ais_dir, "port_calls_latakia.csv"))
# data_join = pd.read_csv(join(ais_dir, "port_calls_latakia.csv"), index_col=0)

data_join.loc[:, "draught_delta"] = 1

data_join.loc[data_join["draught-diff"] == 0, "draught_delta"] = 0

data_join.draught_delta.value_counts() / len(data_join)

0    0.848649
1    0.151351
Name: draught_delta, dtype: float64

data_join.draught_delta.value_counts()

0    157
1     28
Name: draught_delta, dtype: int64

data_join.loc[:, "draft_ais_max"] = data_join[["draught-in", "draught-out"]].max(axis=1)

# data_join[data_join.dwt.isna()].iloc[0]
# data_join[data_join.draft_ais_max==0]

train = data_join[(data_join.vessel_info == 1) & (~data_join.dwt.isna())].copy()

df_missing = data_join[data_join.dwt.isna()].copy()

len(train) + len(df_missing) == len(data_join)

True

(
    train.length_vessel.isna().sum(),
    train.width_vessel.isna().sum(),
    train.draught_design.isna().sum(),
    train.dwt.isna().sum(),
)

(0, 0, 0, 0)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

y = list(train.dwt.astype("int"))
X = [
    [row.length_vessel, row.width_vessel, row.draught_design]
    for idx, row in train.iterrows()
]
len(y) == len(X)

True

# Spliting arrays or matrices into random train and test subsets
# i.e. 70 % training dataset and 25 % test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# creating a RF classifier
clf = RandomForestClassifier(n_estimators=500, random_state=1)

# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)

# performing predictions on the test dataset
y_pred = clf.predict(X_test)

# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
print("R-Squared OF THE MODEL: ", metrics.r2_score(y_test, y_pred))

ACCURACY OF THE MODEL:  0.7
R-Squared OF THE MODEL:  0.9772705948715836

# df_missing.loc[df_missing.length_ais==0]

X_missing = [
    [row.length_ais, row.width_ais, row.draft_ais_max]
    for idx, row in df_missing.iterrows()
]

pred_with_missing = clf.predict(X_missing)

df_missing.dwt = pred_with_missing

data_join.loc[data_join.dwt.isna(), "dwt"] = df_missing.dwt

# data_join = data_join.loc[data_join.draught_delta==1].copy()
# data_join = data_join.loc[data_join.vessel_info==1].copy()

# data_join['draught-diff'].describe()

df = data_join.copy()

Add block coefficient category

df.columns

Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
       'vessel_type_ais', 'dtg', 'length_ais', 'width_ais', 'draught-in',
       'heading-in', 'vessel_type_main', 'vessel_type_sub', 'draught-diff',
       'heading-diff', 'passing', 'port-name', 'country', 'passing2',
       'sub_type', 'vessel_type_vessel', 'dwt', 'length_vessel',
       'width_vessel', 'draught_design', 'vessel_info', 'draught_delta',
       'draft_ais_max'],
      dtype='object')

df.loc[:, "block_cat"] = ""

df.loc[:, "vessel_type_best"] = df.sub_type
df.vessel_type_best.isna().sum()

df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_vessel"
]
df.vessel_type_best.isna().sum()

df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_sub"
]
df.vessel_type_best.isna().sum()

df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_main"
]
df.vessel_type_best.isna().sum()

df.loc[df.vessel_type_best.isna(), "vessel_type_best"] = df.loc[
    df.vessel_type_best.isna(), "vessel_type_ais"
]
df.vessel_type_best.isna().sum()

df.loc[df.vessel_type_best.str.lower().str.contains("container"), "block_cat"] = (
    "container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("bulk"), "block_cat"] = "bulk"
df.loc[df.vessel_type_best.str.lower().str.contains("tanker"), "block_cat"] = "tanker"
df.loc[df.vessel_type_best.str.lower().str.contains("lng"), "block_cat"] = "LNG"

df.loc[df.block_cat == "", "vessel_type_best"].unique()

array([], dtype=object)

# df.loc[df.block_cat=='bulk', "sub_vessel_type"].unique()
# df.loc[df.block_cat=='container', "sub_vessel_type"].unique()

# df.loc[df.vessel_type_best=='Other', ]

df.loc[df.vessel_type_best.str.lower().str.contains("cargo"), "block_cat"] = "container"
df.loc[df.vessel_type_best.str.lower().str.contains("roro"), "block_cat"] = "container"
df.loc[df.vessel_type_best.str.lower().str.contains("vehicle"), "block_cat"] = (
    "container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("forest"), "block_cat"] = "bulk"
df.loc[df.vessel_type_best.str.lower().str.contains("livestock"), "block_cat"] = (
    "container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("reefer"), "block_cat"] = (
    "container"
)
df.loc[df.vessel_type_best.str.lower().str.contains("other"), "block_cat"] = "container"

df.block_cat.unique()

array(['container', 'tanker', 'bulk'], dtype=object)

Get draft differences from next

aws_bucket = "wbgdecinternal-ntl"
path = "Andres_Temp/AIS"

client = boto3.client("s3")

file_list = client.list_objects_v2(Bucket=aws_bucket, Prefix=path, MaxKeys=5000)
bucket_files = [
    os.path.join("s3://", aws_bucket, content["Key"])
    for content in file_list["Contents"]
]
# bucket_files

df_next = pd.read_csv(
    "s3://wbgdecinternal-ntl/Andres_Temp/AIS/Tartus_next_draught.csv", index_col=0
)

len(df_next)

df_orig = pd.read_csv(join(ais_dir, "port_calls_tartus.csv"), index_col=0)

df_orig[["turn_around_time", "date-leave"]].head(2)

	turn_around_time	date-leave
0	21.360278	2018-12-20
1	10.200833	2019-01-15

df_next = df_next.join(df_orig[["turn_around_time", "date-leave"]])

df = df.merge(
    df_next[["mmsi", "draught", "heading", "date-leave", "turn_around_time"]],
    on=["mmsi", "date-leave", "turn_around_time"],
    how="left",
).copy()

df.loc[(df.draught_delta == 0) & (~df.draught.isna()), "draught-out"] = df.loc[
    (df.draught_delta == 0) & (~df.draught.isna()), "draught"
]
df.loc[(df.draught_delta == 0) & (~df.draught.isna()), "heading-out"] = df.loc[
    (df.draught_delta == 0) & (~df.draught.isna()), "heading"
]

df["draught-diff"] = df["draught-out"] - df["draught-in"]

df.loc[df["draught-diff"] != 0, "draught_delta"] = 1

df.draught_delta.value_counts()

0    111
1     74
Name: draught_delta, dtype: int64

df.loc[df["draught-in"] == 0, "draught_delta"] = 0
df.loc[df["draught-out"] == 0, "draught_delta"] = 0
df.loc[df["draught-in"] == 0, "draught-diff"] = 0
df.loc[df["draught-out"] == 0, "draught-diff"] = 0

df.loc[:, "draft_max"] = df[["draught_design", "draught-in", "draught-out"]].max(axis=1)

df.loc[:, "length"] = df.loc[:, "length_vessel"]
df.loc[df.length.isna(), "length"] = df.loc[df.length.isna(), "length_ais"]

df.loc[:, "width"] = df.loc[:, "width_vessel"]
df.loc[df.width.isna(), "width"] = df.loc[df.width.isna(), "width_ais"]

def calculate_payload(row, direction):
    # get parameters
    Din = row["draught-in"]
    Dout = row["draught-out"]
    if Din == Dout:
        return pd.NA
    else:
        L = row["length"]
        W = row["width"]
        Dd = row["draft_max"]
        DWT = row["dwt"]
        Cb = block_coefficients[row["block_cat"]]
        pw = 1.025  # tons/m3  1029kg/m3

        if direction == "in":
            Dr = Din
        if direction == "out":
            Dr = Dout

        # calcualte block coefficient for reported draft
        Cbr = 1 - ((1 - Cb) * ((Dr / Dd) ** (1 / 3)))

        # calcualte payload rate
        payload = ((((Cbr * Dr) - (Cb * Dd)) * (L * W * pw)) + DWT) / DWT
        return payload

df.loc[:, "payload_in"] = df.apply(lambda x: calculate_payload(x, "in"), axis=1)
df.loc[:, "payload_out"] = df.apply(lambda x: calculate_payload(x, "out"), axis=1)

# df.loc[:, ['payload_in', 'payload_out']]

df.loc[:, "export_val"] = 0
df.loc[:, "import_val"] = 0

for idx, row in df.iterrows():
    delta = row["draught-diff"]
    if delta < 0:
        import_val = (row.payload_in * row.dwt) - (row.payload_out * row.dwt)
        export_val = 0
    elif delta > 0:
        import_val = 0
        export_val = (row.payload_out * row.dwt) - (row.payload_in * row.dwt)
    df.loc[idx, "import_val"] = import_val
    df.loc[idx, "export_val"] = export_val

# trade_in = df.loc[df['draught-diff']<0, "import_val"].sum()
# trade_out = df.loc[df['draught-diff']>0, "export_val"].sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)

df_sub = df.loc[df.draught_delta == 1].copy()

avg_flow = df_sub[["mmsi", "import_val", "export_val"]].groupby("mmsi").mean()

avg_flow.loc[avg_flow.import_val > avg_flow.export_val, "import_avg"] = (
    avg_flow.import_val
)
avg_flow.loc[avg_flow.import_val < avg_flow.export_val, "export_avg"] = (
    avg_flow.export_val
)

avg_flow.fillna(0, inplace=True)
# avg_flow.reset_index(inplace=True)
avg_flow.drop(["import_val", "export_val"], axis=1, inplace=True)

len(avg_flow)

for idx, row in df.iterrows():
    delta = row["draught-diff"]
    if delta == 0:
        try:
            avg = avg_flow.loc[row.mmsi]
            df.loc[idx, "import_val"] = avg.import_avg  # import_val
            df.loc[idx, "export_val"] = avg.export_avg  # export_val
        except:
            df.loc[idx, "import_val"] = 0
            df.loc[idx, "export_val"] = 0

len(df.loc[df.draught_delta == 0])

len(df.loc[(df.draught_delta == 0) & (df.import_val > 0)])

len(df.loc[(df.draught_delta == 0) & (df.export_val > 0)])

# trade_in = (df.payload_in*df.dwt).sum()
# trade_out = (df.payload_out*df.dwt).sum()
# trade_tot = trade_in+trade_out
# fr_im = trade_in/trade_tot
# fr_exp = trade_out/trade_tot
# (fr_im, fr_exp)

# df.loc[:, "export_val"] = 0
# df.loc[:, "import_val"] = 0

# for idx, row in df.iterrows():
#     delta = row['draught-diff']
#     if delta < 0:
#         import_val = (row.payload_in*row.dwt) - (row.payload_out*row.dwt)
#         export_val = 0
#     elif delta > 0:
#         import_val = 0
#         export_val = (row.payload_out*row.dwt) - (row.payload_in*row.dwt)
#     elif delta == 0:
#         if fr_im>fr_exp:
#             fr_import = fr_im-fr_exp
#             fr_export = 0
#         elif fr_im<fr_exp:
#             fr_export = fr_exp-fr_im
#             fr_import = 0
#         import_val = fr_import*row.dwt
#         export_val = fr_export*row.dwt
#     df.loc[idx, 'import_val'] = import_val
#     df.loc[idx, 'export_val'] = export_val

df_tartus_trade = df.copy()

Asseble both

df_final = pd.concat([df_latakia_trade, df_tartus_trade])

data_dir

'/home/wb514197/data/AIS/Syria'

df_final["date"] = pd.to_datetime(df_final["date-entry"])
df_final["ym"] = df_final["date"].dt.strftime("%Y-%m")

len(df_final)

len(df_final.loc[(df_final.export_val == 0) & (df_final.import_val == 0)])

df_final = df_final.loc[
    ~((df_final.export_val == 0) & (df_final.import_val == 0))
].copy()

df_final = (
    df_final[["port-name", "date-entry", "ym", "export_val", "import_val"]]
    .groupby(["port-name", "date-entry"])
    .sum()
)

df_final.reset_index(inplace=True)

df_final["date-entry"] = pd.to_datetime(df_final["date-entry"])
df_final["ym"] = df_final["date-entry"].dt.strftime("%Y-%m")
df_final.to_csv(join(ais_dir, "final_results_12.13.csv"))