
    $i@                        d Z ddlmZmZmZmZmZmZmZ	 dZ
dZdZdZdZdZd	Zd
ZdZdZi Zi Zi Zi Zd Zd Z e        [[d Zd Zd Zd ZdddZd Zd Z d Z!d Z"d Z#e"e e#e!dZ$d Z%e&dk(  rddl'Z' e'jP                          yy)a  Core functions for Unicode normalization.

This module provides the main functions for applying the four Unicode
normalization forms: NFC, NFD, NFKC, and NFKD. It relies on data files
from the Unicode character database (UCD) associated with version 17.0
of the Unicode Standard. As a result, the module can only fully handle
the characters defined in version 17.0 and produces results consistent
with the definitions and rules of that version.
    )_COMPOSITION_EXCLUSIONS_DECOMP_BY_CHARACTER_NFC__QC_NO_OR_MAYBE_NFD__QC_NO_NFKC_QC_NO_OR_MAYBE_NFKD_QC_NO_NON_ZERO_CCC_TABLEi   i  i   i  ia  iu  i  i        c                     | D ]t  }|g}|r_g }|D ]M  }|t         v r|j                  t         |          $|| v r|j                  | |          =|j                  |       O ||k(  rn|}|r_|x| |<   t         |<   v y N)_SHARED_FULL_DECOMP_CACHEextendappend)decomp_dictkeydecompositionnext_decompks        K/var/www/br/venv/lib/python3.12/site-packages/pyunormalize/normalization.py_compute_full_decompositionsr   =   s      JK" *11&&'@'CD+%&&{1~6&&q)* m+'M $ =JIC4S9+J    c                  2   t        j                         D ]b  \  } }t        |d   t              r>t	        |      dk(  r|d   t
        vr| t        t        |      <   |xt        | <   t        | <   W|dd  t        | <   d t        t               t        t               y )Nr         )r   items
isinstanceintlen_CCC_COMPOSITE_BY_CDECOMPtuple_FULL_CDECOMP_BY_CHAR_FULL_KDECOMP_BY_CHARr   )r   vals     r   _init_normalization_tablesr&   ^   s    (..0 1Sc!fc"3x1}Qt!347%eCj1FII!#&)>s)C *-QR!#&1 !!67 !!67r   c                 n    d}| D ].  }t        |      }||v r y|t        vrt        |   x}|k  r y|}0 y)Nr   FT)ordr    )stringquick_check_setprev_ccccharcpcurr_cccs         r   _quick_checkr/   y   sT    
 H Y T>R HH, r   c                     | t         z
  }|t        z  }||z
  t        z  }t        |t        z  z   }t        |t        z  z   }|r||t
        dz
  |z   fS ||fS Nr   )_SB_TCOUNT_VB_VCOUNT_LB_TB)r-   sindextindexqVLs         r   _decompose_hangul_syllabler=      sc     #XFgF	&W$AqG|AqG|A1cAg&''q6Mr   c                 B   t         | cxk  r	t        k  rAn n>t        |cxk  r	t        k  r+n n(t        | t         z
  t
        z  |z   t        z
  t        z  z   S t        | cxk  r	t        k  r8n y | t        z
  t        z  s%t        |cxk  r	t        k  rn y | |z   t        dz
  z
  S y r1   )
r6   _LLr4   _VLr2   r5   r3   _SLr7   _TL)r-   next_cps     r   _compose_hangul_syllablerD      s     bCC71c1SG+w6<GGG 	rS  cW$7!c!  G|sQw'' r   c                 B   t        |       }|dkD  rqd}d}||k  r`| |   }|t        vr|dz  }| |dz
     }|t        vst        |   t        |   k  r|dz  }A| |   | |dz
     c| |dz
  <   | |<   |}|dz  }||k  r`|}|dkD  rqdj                  t        t        |             S )Nr   r   r    )r   r    joinmapchr)
codepointssizeswap_posicurrprevs         r   _reorderrP      s    
 z?D ($ha=D 4Qa!e$D 44:d#;Q 0:!}jQ>O,Jq1uz!}HFA- $h2 ; (> 773sJ'((r   Fcompatibilityc                   g }|rt         nt        }t        t        |       D ]]  }||v r|j	                  ||          t
        |cxk  r	t        k  rn n|j	                  t        |             M|j                  |       _ |S r   )	r$   r#   rH   r(   r   r2   rA   r=   r   )r)   rR   rJ   decompr-   s        r   
_decomposerU      sz     J&3"9NF#v "<fRj)B#8<=b!" r   c           	         g t        t        |       }t        |      D ]  \  }}||t        v rd}d}t        ||dz   d  |dz         D ]}  \  }}|t        v rd}nd}|r|r||dz
     }||t        vst        |   t        |   k  s>||f}	|	t        v r
t        |	   }
nt        |	 }
|
r|
t        vrd ||<   |
x||<   }|rd}wd}z|s}   dj                  t        t        t        d |                  S )NFr   TrF   )
rH   r(   	enumerater    r!   rD   r   rG   rI   filter)r)   rJ   rM   r-   
is_blockedlast_is_combiningjrC   prev_cppairprecomposed_chars              r   _composer_      sL   
 %3sF#$J :& =2 :t 
! $Jq1uw$7Q? 2	JAw$$(! "
/ Q'G $&=4=0G}00'<T'B$ (@'F$
 %(0GG$(JqM *:9JqMB "%*
,1) "e2	=~ 773sF44566r   c                 N    t        | t              r| S t        t        |             S )uO  Return the normalization form D of the input string.

    Replaces composed characters with their canonically equivalent decomposed
    forms in canonical order, while leaving compatibility characters
    unaffected.

    The function first checks if the input is already in NFD. If so, it returns
    the string unchanged to avoid unnecessary processing.

    Args:
        string (str): The string to be normalized.

    Returns:
        str: The NFD string.

    Examples:
        # Decomposing accented characters
        >>> string = "élève"
        >>> nfd = NFD(string)
        >>> nfd
        'élève'
        >>> nfd != string  # binary content differs
        True
        >>> " ".join([f"{ord(c):04X}" for c in string])
        '00E9 006C 00E8 0076 0065'
        >>> " ".join([f"{ord(c):04X}" for c in nfd])
        '0065 0301 006C 0065 0300 0076 0065'

        # Decomposing Hangul syllables
        >>> string = "한국"
        >>> nfd = NFD(string)
        >>> nfd
        '한국'
        >>> " ".join([f"{ord(c):04X}" for c in string])
        'D55C AD6D'
        >>> " ".join([f"{ord(c):04X}" for c in nfd])
        '1112 1161 11AB 1100 116E 11A8'

        # Compatibility characters are unaffected
        >>> strings = ["ﬃ", "⑴", "²", "ｱｲｳｴｵ"]
        >>> all(NFD(s) == s for s in strings)
        True
    )r/   r   rP   rU   r)   s    r   NFDrb   A  s$    X FK(Jv&''r   c                 R    t        | t              r| S t        t        | d            S )u  Return the normalization form KD of the input string.

    Replaces composed characters with their canonically equivalent decomposed
    forms in canonical order and converts compatibility characters into their
    nominal counterparts.

    The function first checks if the input is already in NFKD. If so, it
    returns the string unchanged to avoid unnecessary processing.

    Args:
        string (str): The string to be normalized.

    Returns:
        str: The NFKD string.

    Examples:
        # NFKD decomposes canonically composed forms
        >>> NFKD("élève") == "élève"
        True

        # NFKD converts compatibility characters
        >>> [NFKD(s) for s in ["ﬃ", "⑴", "²", "ｱｲｳｴｵ"]]
        ['ffi', '(1)', '2', 'アイウエオ']
    TrQ   )r/   r   rP   rU   ra   s    r   NFKDrd   r  s%    2 FK(JvT:;;r   c                 N    t        | t              r| S t        t        |             S )uD  Return the normalization form C of the input string.

    Replaces character sequences with their canonically equivalent composed
    forms whenever possible, while leaving compatibility characters unaffected.

    The function first checks if the input is already in NFC. If so, it returns
    the string unchanged to avoid unnecessary processing.

    Args:
        string (str): The string to be normalized.

    Returns:
        str: The NFC string.

    Examples:
        # Composing accented characters
        >>> string = "élève"
        >>> nfc = NFC(string)
        >>> nfc
        'élève'
        >>> nfc != string  # binary content differs
        True
        >>> " ".join([f"{ord(c):04X}" for c in string])
        '0065 0301 006C 0065 0300 0076 0065'
        >>> " ".join([f"{ord(c):04X}" for c in nfc])
        '00E9 006C 00E8 0076 0065'

        # Composing Hangul syllables
        >>> string = "한국"
        >>> nfc = NFC(string)
        >>> nfc
        '한국'
        >>> " ".join([f"{ord(c):04X}" for c in string])
        '1112 1161 11AB 1100 116E 11A8'
        >>> " ".join([f"{ord(c):04X}" for c in nfc])
        'D55C AD6D'

        # Compatibility characters are unaffected
        >>> strings = ["ﬃ", "⑴", "²", "ｱｲｳｴｵ"]
        >>> all(NFC(s) == s for s in strings)
        True
    )r/   r   r_   rb   ra   s    r   NFCrf     s$    V F01CK  r   c                 N    t        | t              r| S t        t        |             S )u  Return the normalization form KC of the input string.

    Replaces character sequences with their canonically equivalent composed
    forms whenever possible and converts compatibility characters into their
    nominal counterparts.

    The function first checks if the input is already in NFKC. If so, it
    returns the string unchanged to avoid unnecessary processing.

    Args:
        string (str): The string to be normalized.

    Returns:
        str: The NFKC string.

    Examples:
        # NFKC composes canonically decomposed forms
        >>> NFKC("élève") == "élève"
        True

        # NFKC converts compatibility characters
        >>> [NFKC(s) for s in ["ﬃ", "⑴", "²", "ｱｲｳｴｵ"]]
        ['ffi', '(1)', '2', 'アイウエオ']
    )r/   r   r_   rd   ra   s    r   NFKCrh     s#    2 F01DL!!r   )rf   rb   rh   rd   c                     t        |    |      S )uE  Return the normalized form of the input string as specified by `form`.

    This function transforms the input string according to the normalization
    form given in `form`. Supported values are "NFC", "NFD", "NFKC",
    and "NFKD".

    Args:
        form (str): The normalization form to apply, one of "NFC", "NFD",
            "NFKC", or "NFKD".

        string (str): The string to be normalized.

    Returns:
        str: The normalized string.

    Examples:
        >>> normalize("NFKD", "ﬂuﬃness")
        'fluffiness'

        >>> forms = ["NFC", "NFD", "NFKC", "NFKD"]
        >>> string = "ẛ̣"
        >>> def hexpoints(s):
        ...     return " ".join([f"{ord(c):04X}" for c in s])
        >>> [hexpoints(normalize(f, string)) for f in forms]
        ['1E9B 0323', '017F 0323 0307', '1E69', '0073 0323 0307']
    )_NORMALIZATION_FORMS)formr)   s     r   	normalizerl     s    6  %f--r   __main__N))__doc__pyunormalize._unicode_datar   r   r   r   r   r   r	   r    r2   rA   r6   r?   r4   r@   r7   rB   r5   r3   r   r!   r#   r$   r   r&   r/   r=   rD   rP   rU   r_   rb   rd   rf   rh   rj   rl   __name__doctesttestmod r   r   <module>rt      s                  JB8*   30 4')T ). $G7T.(b<<-!`">  #3dK .< zGOO r   