
    L0&jI?                        U d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
 ddlmZ  ej        e          ZdedefdZ edd	h          Z e ej        d
           ej        d           ej        d           ej        d           ej        d           ej        d           ej        d           ej        d           ej        d          h	          Z ej        d           ej        d          fZ edh          Z ej        d          Zdadaeed<   defdZd#dZdej         ej!        z  defdZ"dedefdZ#dededefd Z$dedefd!Z%dedefd"Z&dS )$uL  URL safety checks — blocks requests to private/internal network addresses.

Prevents SSRF (Server-Side Request Forgery) where a malicious prompt or
skill could trick the agent into fetching internal resources like cloud
metadata endpoints (169.254.169.254), localhost services, or private
network hosts.

The check can be globally disabled via ``security.allow_private_urls: true``
in config.yaml for environments where DNS resolves external domains to
private/benchmark-range IPs (OpenWrt routers, corporate proxies, VPNs
that use 198.18.0.0/15 or 100.64.0.0/10).  Even when disabled, cloud
metadata hostnames (metadata.google.internal, 169.254.169.254) are
**always** blocked — those are never legitimate agent targets.

Limitations (documented, not fixable at pre-flight level):
  - DNS rebinding (TOCTOU): an attacker-controlled DNS server with TTL=0
    can return a public IP for the check, then a private IP for the actual
    connection. Fixing this requires connection-level validation (e.g.
    Python's Champion library or an egress proxy like Stripe's Smokescreen).
  - Redirect-based bypass is mitigated by httpx event hooks that re-validate
    each redirect target in vision_tools, gateway platform adapters, and
    media cache helpers. Web tools use third-party SDKs (Firecrawl/Tavily)
    where redirect handling is on their servers.
    N)quoteurlparseurlsplit
urlunsplit)is_truthy_valueurlreturnc                 h   t          | t                    s| S |                                 }|s|S 	 t          |          }n# t          $ r |cY S w xY w|j                                        dvr|S |j        }|j        }|rY	 |	                    d          
                    d          }n# t          $ r |}Y nw xY w||k    r|                    ||d          }t          |j        d          }t          |j        d          }t          |j        d          }t#          |j        ||||f          S )u  Return an ASCII-safe HTTP URL for Hermes-owned URL tools.

    Browsers and HTTP clients expect URIs, but users and models often provide
    IRIs such as ``https://wttr.in/Köln``.  Preserve URL syntax and existing
    percent escapes while encoding non-ASCII host/path/query/fragment text.
    This is intentionally for URL tool inputs only; arbitrary shell commands
    must not be rewritten.
    >   httphttpsidnaascii   z/%:@!$&'()*+,;=)safez/%:@!$&'()*+,;=?)
isinstancestrstripr   
ValueErrorschemelowernetlochostnameencodedecodeUnicodeErrorreplacer   pathqueryfragmentr   )	r   rawparsedr   r   
ascii_hostr   r   r   s	            5/home/ubuntu/.hermes/hermes-agent/tools/url_safety.pynormalize_url_for_requestr$   &   sf    c3 

))++C 
#   


 }$555
]FH =	"!0077@@JJ 	" 	" 	"!JJJ	"!!^^Hj!<<F#4555D&,%7888EV_+=>>>Hv}fdE8DEEEs#   A AA(B* *B98B9zmetadata.google.internalzmetadata.googz169.254.169.254z169.254.170.2z169.254.169.253zfd00:ec2::254z100.100.100.200z::ffff:169.254.169.254z::ffff:169.254.170.2z::ffff:169.254.169.253z::ffff:100.100.100.200z169.254.0.0/16z::ffff:169.254.0.0/112zmultimedia.nt.qq.com.cnz100.64.0.0/10F_cached_allow_privatec                  z   t           rt          S da dat          j        dd                                                                          } | dv r	dat          S | dv rt          S 	 ddlm}  |            }|                    d	i           }t          |t                    r-t          |                    d
          d          r	dat          S |                    di           }t          |t                    r-t          |                    d
          d          r	dat          S n# t          $ r Y nw xY wt          S )ac  Return True when the user has opted out of private-IP blocking.

    Checks (in priority order):
    1. ``HERMES_ALLOW_PRIVATE_URLS`` env var  (``true``/``1``/``yes``)
    2. ``security.allow_private_urls`` in config.yaml
    3. ``browser.allow_private_urls`` in config.yaml  (legacy / backward compat)

    Result is cached for the process lifetime.
    TFHERMES_ALLOW_PRIVATE_URLS >   1yestrue>   0nofalser   )read_raw_configsecurityallow_private_urls)defaultbrowser)_allow_private_resolvedr%   osgetenvr   r   hermes_cli.configr/   getr   dictr   	Exception)env_valr/   cfgsecr3   s        r#   _global_allow_private_urlsr>      sw     %$$"! i3R88>>@@FFHHG&&& $$$&&&$$555555oggj"%%c4   	)_GG())5&
 &
 &
 	) %)!(('')R((gt$$ 	)KK,--u*
 *
 *
 	) %)!((    ! s   %A'D& AD& &
D32D3c                      da dadS )u+   Reset the cached toggle — only for tests.FN)r4   r%        r#   _reset_allow_private_cacherB      s     $!rA   ipc                 .   t          | t          j                  rA| j        :| j        }|j        p+|j        p$|j        p|j        p|j        p|j	        p|t          v S | j        s| j        s| j        s| j        rdS | j        s| j	        rdS | t          v rdS dS )z<Return True if the IP should be blocked for SSRF protection.NTF)r   	ipaddressIPv6Addressipv4_mapped
is_privateis_loopbackis_link_localis_reservedis_multicastis_unspecified_CGNAT_NETWORK)rC   embedded_ips     r#   _is_blocked_iprP      s     "i+,, /1Kn& .+*A .).-8-D.(.,7,F. ~-	/ 
}  "*: bn t	 "+ t	^t5rA   c                   	
 	 t          |           }|j        pd                                                                                    d          }|sdS |t
          v rt                              d|           dS 	 t          j	        |          	n# t          $ r d	Y nw xY w	H	t          v s t          	fdt          D                       rt                              d|           dS dS 	 t          j        |dt          j        t          j                  }n# t          j        $ r Y dS w xY w|D ]~\  }}}}}|d	         }	 t          j	        |          
n# t          $ r Y 2w xY w
t          v s t          
fd
t          D                       rt                              d||            dS dS # t&          $ r'}t                              d| |           Y d}~dS d}~ww xY w)u  Return True when the URL targets an always-blocked endpoint.

    This is the security floor — cloud metadata IPs / hostnames
    (169.254.169.254, metadata.google.internal, ECS task metadata, etc.)
    that have no legitimate agent use regardless of backend, routing, or
    the ``allow_private_urls`` toggle.  Used by callers that bypass the
    full ``is_safe_url`` check for their own reasons (e.g. hybrid cloud
    browser routing to a local Chromium sidecar for private URLs) and
    still need to enforce the non-negotiable floor before letting the
    request proceed.

    Returns True (= blocked) on:
      - Hostnames in ``_BLOCKED_HOSTNAMES``
      - IPs / networks in ``_ALWAYS_BLOCKED_IPS`` / ``_ALWAYS_BLOCKED_NETWORKS``
      - URLs whose hostname resolves to any of the above

    Returns False (= not in the always-blocked floor) on:
      - Benign public / private / loopback URLs (whether or not they'd
        be blocked by the ordinary SSRF check)
      - DNS-resolution failures for non-sentinel hostnames (these are
        someone else's problem — the caller's ordinary fail-closed path
        will catch them if applicable)
      - Parse errors (caller decides fail-open vs fail-closed)

    Intentionally narrower than ``is_safe_url``: only blocks the sentinel
    set, not ordinary private addresses.  Callers that want the full
    SSRF check should still use ``is_safe_url``.
    r(   .Fz?Blocked request to internal hostname (always-blocked floor): %sTNc              3       K   | ]}|v V  	d S Nr@   .0netrC   s     r#   	<genexpr>z(is_always_blocked_url.<locals>.<genexpr>  s8       0 0!c	0 0 0 0 0 0rA   zDBlocked request to cloud metadata address (always-blocked floor): %sr   c              3       K   | ]}|v V  	d S rT   r@   )rV   rW   resolveds     r#   rX   z(is_always_blocked_url.<locals>.<genexpr>!  s8       6 6$'C6 6 6 6 6 6rA   zJBlocked request to cloud metadata address (always-blocked floor): %s -> %sz&is_always_blocked_url error for %s: %s)r   r   r   r   rstrip_BLOCKED_HOSTNAMESloggerwarningrE   
ip_addressr   _ALWAYS_BLOCKED_IPSany_ALWAYS_BLOCKED_NETWORKSsocketgetaddrinfo	AF_UNSPECSOCK_STREAMgaierrorr:   debug)r   r!   r   	addr_info_family_sockaddrip_strexcrC   rZ   s            @@r#   is_always_blocked_urlro      s   :@#O)r002288::AA#FF 	5 )))NNQ   4	%h//BB 	 	 	BBB	 >(((C 0 0 0 0%=0 0 0 - -( 1  
 t5	*$ 0&2D II  	 	 	55	 +4 	 	&GQ1ha[F$/77   ...# 6 6 6 6+C6 6 6 3 3. 7	   tt / u    	=sCHHHuuuuu	s   AF5 $F5 >B F5 B"F5 !B""A	F5 0+D F5 D/+F5 .D//F5 EF5 
E'$F5 &E''A	F5 2F5 5
G&?G!!G&r   r   c                      |dk    o| t           v S )zGReturn True when a trusted HTTPS hostname may bypass IP-class blocking.r   )_TRUSTED_PRIVATE_IP_HOSTS)r   r   s     r#   _allows_private_ip_resolutionrr   5  s    WF-F!FFrA   c                    	 t          |           }|j        pd                                                                                    d          }|j        pd                                                                }|dvrt                              d|pd           dS |sdS |t          v rt                              d|           dS t                      }t          ||          }	 t          j        |dt          j        t          j                  }n1# t          j        $ r t                              d	|           Y dS w xY w|D ]\  }}}}}	|	d
         }
	 t!          j        |
          n# t$          $ r Y 2w xY wt&          v s t)          fdt*          D                       rt                              d||
            dS |s0|s.t-                    rt                              d||
            dS |rt                              d|           n|rt                              d|           dS # t0          $ r'}t                              d| |           Y d}~dS d}~ww xY w)u  Return True if the URL target is not a private/internal address.

    Resolves the hostname to an IP and checks against private ranges.
    Fails closed: DNS errors and unexpected exceptions block the request.

    When ``security.allow_private_urls`` is enabled (or the env var
    ``HERMES_ALLOW_PRIVATE_URLS=true``), private-IP blocking is skipped.
    Cloud metadata endpoints (169.254.169.254, metadata.google.internal)
    remain blocked regardless — they are never legitimate agent targets.
    r(   rR   >   r   r   u.   Blocked request — unsupported URL scheme: %sz<empty>Fz(Blocked request to internal hostname: %sNu1   Blocked request — DNS resolution failed for: %sr   c              3       K   | ]}|v V  	d S rT   r@   rU   s     r#   rX   zis_safe_url.<locals>.<genexpr>j  s'      /^/^cc	/^/^/^/^/^/^rA   z3Blocked request to cloud metadata address: %s -> %sz5Blocked request to private/internal address: %s -> %szKAllowing private/internal resolution (security.allow_private_urls=true): %szAAllowing trusted hostname despite private/internal resolution: %sTu5   Blocked request — URL safety check error for %s: %s)r   r   r   r   r[   r   r]   r^   r\   r>   rr   rc   rd   re   rf   rg   rE   r_   r   r`   ra   rb   rP   rh   r:   )r   r!   r   r   allow_all_privateallow_private_ipri   familyrk   rl   rm   rn   rC   s               @r#   is_safe_urlrx   :  s   D#O)r002288::AA#FF-%2,,..4466***NNKVM`W`aaa5 	5 )))NNExPPP5 78886JJ	*8T6;KVM_``II 	 	 	 NNNPXYYY55		 *3 	 	%FAq!Xa[F)&11    (((C/^/^/^/^E]/^/^/^,^,^(If   uu$ -= .QSBTBT Kf   uu 		LL]     	LLS  
 t    	NPSUXYYYuuuuu	s   BH8 "H8 &$H8 H8 ++D H8 *EH8 EH8 E0/H8 0
E=:H8 <E==A	H8 0H8 :<H8 8
I)I$$I)c                 F   K   t          j        t          |            d{V S )zSame rules as :func:`is_safe_url`, but run the DNS work off the event loop.

    ``socket.getaddrinfo`` can block; call this from async code paths (gateway,
    ``web_extract_tool``, vision download hooks) instead of ``is_safe_url``.
    N)asyncio	to_threadrx   )r   s    r#   async_is_safe_urlr|     s-       ";444444444rA   )r	   N)'__doc__rE   loggingr5   rc   rz   urllib.parser   r   r   r   utilsr   	getLogger__name__r]   r   r$   	frozensetr\   r_   r`   
ip_networkrb   rq   rN   r4   r%   bool__annotations__r>   rB   IPv4AddressrF   rP   ro   rr   rx   r|   r@   rA   r#   <module>r      s    2      				   > > > > > > > > > > > > ! ! ! ! ! !		8	$	$&F3 &F3 &F &F &F &FV Y      iI*++I))I*++I))I*++I122I/00I122I122!    I)**I122  &I'    &%o66   # t # # #0!D 0! 0! 0! 0!f" " " "y,y/DD     ,]s ]t ] ] ] ]@GC G G G G G G
OS OT O O O Od5 5 5 5 5 5 5 5rA   