#!/bin/bash

# Names of the custom chains created by this script.
aembit_raw_output_chain_name=aembit_raw_output_chain
aembit_nat_output_chain_name=aembit_nat_output_chain
aembit_nat_prerouting_chain_name=aembit_nat_prerouting_chain

# Group ID reported by `id` for aembit_agent_proxy. The owner-match rules in
# this script use it to exempt the Agent Proxy's own traffic.
# Abort if the lookup fails: with an empty group ID those exemption rules cannot
# be installed correctly, while the REDIRECT rules would still be added, sending
# the Agent Proxy's own traffic back to itself.
if ! aembit_group_id=$(id --group aembit_agent_proxy) || [[ -z "${aembit_group_id}" ]]; then
    printf '%s\n' "error: unable to resolve the group ID of aembit_agent_proxy; refusing to install iptables rules" >&2
    exit 1
fi

# Local port that new TCP connections are redirected to.
agent_proxy_port=38080
# Local port that UDP DNS (port 53) traffic is redirected to.
agent_dns_port=8053
# Optional CIDR block of container traffic to redirect; empty when not provided.
container_cidr=${AEMBIT_DOCKER_CONTAINER_CIDR:-}
# https://learn.microsoft.com/en-us/azure/virtual-network/what-is-ip-address-168-63-129-16
azure_platform_resources_ip=168.63.129.16

# Select the legacy backend for both the IPv4 and IPv6 tools.
for netfilter_tool in iptables ip6tables; do
    update-alternatives --set "${netfilter_tool}" "/usr/sbin/${netfilter_tool}-legacy"
done

# ----- aembit_raw_output_chain ------

# Create the chain in the "raw" table.
iptables --table raw --new-chain "${aembit_raw_output_chain_name}"

# Rule 1: never track DNS queries sent by the Agent Proxy.
#
# Background: we hit a bug where egress traffic from the Agent Proxy's DNS proxy
# was routed back to the Agent Proxy itself. The sequence was:
#
# 1. The Client Workload (CW) and the Agent Proxy (AP) share networking
#    infrastructure, so they have the same IP address.
# 2. When the CW first connects, the DNS NAT rule in this file rewrites the
#    destination IP and port to point at the AP.
# 3. Once an iptables rule has been evaluated, conntrack stores the result as a
#    "connection" and the rule is not evaluated again until that conntrack entry
#    expires. The stored connection looks like:
#        - Original direction: <CW/AP IP>:<EPHEMERAL PORT X> -> <EXTERNAL DNS IP>:53
#        - Reply direction: 127.0.0.1:8053 -> <CW/AP IP>:<EPHEMERAL PORT X>
# 4. The AP asks the host for an ephemeral port for its own outgoing DNS request
#    and is sometimes handed a port the CW previously used and released. Because
#    the mapping from step 3 is still stored, the AP's outbound DNS request is
#    sent to the AP, forming a loop.
#
# The "raw" table is evaluated before conntrack, so marking the AP's outgoing
# DNS packets NOTRACK here stops conntrack from associating them with any
# connection, even if a matching one already exists.
#
# See https://www.frozentux.net/iptables-tutorial/iptables-tutorial.html#STATEMACHINE
# for more information on conntrack.
iptables --table raw --append "${aembit_raw_output_chain_name}" \
    --protocol udp --dport 53 \
    --match owner --gid-owner "${aembit_group_id}" \
    --jump NOTRACK

# Rule 2: never track DNS replies coming from ResolveD.
#
# This is very similar to the story above, but was only seen in the VM environment:
#
# 1. The CW talks to ResolveD (running on 127.0.0.53:53) from an ephemeral port,
#    and that exchange is added to conntrack.
# 2. The CW releases the port, which becomes available again immediately.
# 3. The system happens to give the same port to the AP when it performs DNS
#    resolution as part of connecting to a server workload.
# 4. The AP's request to 127.0.0.53:53 is NOTRACK (per rule 1).
# 5. ResolveD's reply to the AP matches the existing conntrack entry and gets
#    NATed (for some reason the port is modified), so the packet is unreachable.
#
# Marking traffic from ResolveD as NOTRACK makes sure it is not NATed.
iptables --table raw --append "${aembit_raw_output_chain_name}" \
    --protocol udp --source 127.0.0.53 --sport 53 \
    --jump NOTRACK

# Hook the chain into the built-in OUTPUT chain of the "raw" table.
iptables --table raw --append OUTPUT --jump "${aembit_raw_output_chain_name}"

# ----- aembit_nat_output_chain ------

# Create chain
iptables -t nat -N "${aembit_nat_output_chain_name}"

# Ignore (return to the main chain) all packets from our agent proxy.
# This rule must stay first so the proxy's own traffic is never redirected.
iptables -t nat -A "${aembit_nat_output_chain_name}" -m owner --gid-owner "${aembit_group_id}" -j RETURN

# Ignore TCP loopback
iptables -t nat -A "${aembit_nat_output_chain_name}" -p tcp -o lo -j RETURN

# Redirect all new TCP connections to the Agent Proxy.
# We use the "--syn" flag to forward new connections only, since,
# prior to adding NAT rules, conntrack is not loaded. Once we add NAT rules,
# conntrack is loaded, but mistakenly identifies existing connections as new
# connections, redirecting them erroneously to the Agent Proxy mid-connection.
# "--to-ports" is the documented REDIRECT option name; spelling it out avoids
# relying on getopt prefix abbreviation.
iptables -t nat -A "${aembit_nat_output_chain_name}" -p tcp --syn -j REDIRECT --to-ports "${agent_proxy_port}"

# Redirect DNS traffic to agent proxy DNS port, except for outbound traffic from systemd-resolved.
# The lookup is expected to fail on hosts without a systemd-resolve user; that case is
# handled by the emptiness check below, so the error message from `id` is intentionally discarded.
resolve_d_user_id=$(id --user systemd-resolve 2>/dev/null)
if [[ -n "${resolve_d_user_id}" ]]; then
    iptables -t nat -A "${aembit_nat_output_chain_name}" -p udp --dport 53 -m owner --uid-owner "${resolve_d_user_id}" -j RETURN
fi
iptables -t nat -A "${aembit_nat_output_chain_name}" -p udp --dport 53 -j REDIRECT --to-ports "${agent_dns_port}"

# Add chain
iptables -t nat -A OUTPUT -j "${aembit_nat_output_chain_name}"

# ----- aembit_nat_prerouting_chain ------

# Create chain
iptables -t nat -N "${aembit_nat_prerouting_chain_name}"

# Container traffic hits the NAT table's PREROUTING chain and can be redirected from there to the Agent Proxy.
# The chain itself is always created and attached; the redirect rule is only added when the
# containers CIDR block was specified.
# The expansions are quoted so the value is always passed to iptables as a single argument.
if [[ -n "${container_cidr}" ]]; then
    iptables -t nat -A "${aembit_nat_prerouting_chain_name}" -p tcp --syn -s "${container_cidr}" -j REDIRECT --to-ports "${agent_proxy_port}"
fi

# Add chain
iptables -t nat -A PREROUTING -j "${aembit_nat_prerouting_chain_name}"

# ------ Non custom chains ------

# https://learn.microsoft.com/en-us/azure/virtual-network/what-is-ip-address-168-63-129-16
# "IP address 168.63.129.16 is a virtual public IP address that is used to facilitate a communication channel to Azure platform resources. "
# Azure's Linux VMs come with several iptables rules preinstalled in the security table
# that restrict who can communicate with this IP.
# The rule below is inserted at the top of the security table's OUTPUT chain so that
# TCP traffic owned by the Agent Proxy's group is allowed to reach it.

iptables --table security --insert OUTPUT \
    --protocol tcp --destination "${azure_platform_resources_ip}" \
    --match owner --gid-owner "${aembit_group_id}" \
    --jump ACCEPT
