From eb39b8c81a37891e412baf40819693a95efeee5d Mon Sep 17 00:00:00 2001 From: Frost Ming Date: Wed, 17 Nov 2021 11:07:57 +0800 Subject: [PATCH 1/5] Upgrade charset-normalizer to 2.0.7 --- pipenv/vendor/charset_normalizer/__init__.py | 44 +- pipenv/vendor/charset_normalizer/api.py | 399 +++-- .../charset_normalizer/assets/__init__.py | 1286 ++++++++++++++++- pipenv/vendor/charset_normalizer/cd.py | 196 ++- .../charset_normalizer/cli/normalizer.py | 323 +++-- pipenv/vendor/charset_normalizer/constant.py | 687 +++++---- pipenv/vendor/charset_normalizer/legacy.py | 79 +- pipenv/vendor/charset_normalizer/md.py | 263 ++-- pipenv/vendor/charset_normalizer/models.py | 166 ++- pipenv/vendor/charset_normalizer/utils.py | 143 +- pipenv/vendor/charset_normalizer/version.py | 4 +- pipenv/vendor/vendor.txt | 2 +- 12 files changed, 2745 insertions(+), 847 deletions(-) diff --git a/pipenv/vendor/charset_normalizer/__init__.py b/pipenv/vendor/charset_normalizer/__init__.py index 550c205f..ed525034 100644 --- a/pipenv/vendor/charset_normalizer/__init__.py +++ b/pipenv/vendor/charset_normalizer/__init__.py @@ -1,3 +1,4 @@ +# -*- coding: utf_8 -*- """ Charset-Normalizer ~~~~~~~~~~~~~~ @@ -8,24 +9,39 @@ All IANA character set names for which the Python core library provides codecs a Basic usage: >>> from charset_normalizer import from_bytes - >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, поне що се отнася до началното и основното образование.'.encode('utf_8')) - >>> "utf_8" in results - True - >>> best_result = results.best() - >>> str(best_result) - 'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, поне що се отнася до началното и основното образование.' + >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8')) + >>> best_guess = results.best() + >>> str(best_guess) + 'Bсеки човек има право на образование. Oбразованието!' Others methods and usages are available - see the full documentation at . :copyright: (c) 2021 by Ahmed TAHRI :license: MIT, see LICENSE for more details. """ -from pipenv.vendor.charset_normalizer.api import from_fp, from_path, from_bytes, normalize -from pipenv.vendor.charset_normalizer.legacy import detect -from pipenv.vendor.charset_normalizer.version import __version__, VERSION -from pipenv.vendor.charset_normalizer.models import CharsetMatch, CharsetMatches +from .api import from_bytes, from_fp, from_path, normalize +from .legacy import ( + CharsetDetector, + CharsetDoctor, + CharsetNormalizerMatch, + CharsetNormalizerMatches, + detect, +) +from .models import CharsetMatch, CharsetMatches +from .version import VERSION, __version__ -# Backward-compatible v1 imports -from pipenv.vendor.charset_normalizer.models import CharsetNormalizerMatch -import pipenv.vendor.charset_normalizer.api as CharsetDetector -CharsetNormalizerMatches = CharsetDetector +__all__ = ( + "from_fp", + "from_path", + "from_bytes", + "normalize", + "detect", + "CharsetMatch", + "CharsetMatches", + "CharsetNormalizerMatch", + "CharsetNormalizerMatches", + "CharsetDetector", + "CharsetDoctor", + "__version__", + "VERSION", +) diff --git a/pipenv/vendor/charset_normalizer/api.py b/pipenv/vendor/charset_normalizer/api.py index bd555d9e..dce7cf30 100644 --- a/pipenv/vendor/charset_normalizer/api.py +++ b/pipenv/vendor/charset_normalizer/api.py @@ -1,38 +1,48 @@ -from os.path import splitext, basename -from typing import List, BinaryIO, Optional, Set, Union +from os.path import basename, splitext +from typing import BinaryIO, List, Optional, Set try: from os import PathLike -except ImportError: - PathLike = Union[str, 'os.PathLike[str]'] # type: ignore +except ImportError: # pragma: no cover + PathLike = str # type: ignore -from pipenv.vendor.charset_normalizer.constant import TOO_SMALL_SEQUENCE, TOO_BIG_SEQUENCE, IANA_SUPPORTED -from pipenv.vendor.charset_normalizer.md import mess_ratio -from pipenv.vendor.charset_normalizer.models import CharsetMatches, CharsetMatch -from warnings import warn import logging -from pipenv.vendor.charset_normalizer.utils import any_specified_encoding, is_multi_byte_encoding, identify_sig_or_bom, \ - should_strip_sig_or_bom, is_cp_similar, iana_name -from pipenv.vendor.charset_normalizer.cd import coherence_ratio, encoding_languages, mb_encoding_languages, merge_coherence_ratios +from .cd import ( + coherence_ratio, + encoding_languages, + mb_encoding_languages, + merge_coherence_ratios, +) +from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE +from .md import mess_ratio +from .models import CharsetMatch, CharsetMatches +from .utils import ( + any_specified_encoding, + iana_name, + identify_sig_or_bom, + is_cp_similar, + is_multi_byte_encoding, + should_strip_sig_or_bom, +) logger = logging.getLogger("charset_normalizer") logger.setLevel(logging.DEBUG) handler = logging.StreamHandler() -handler.setFormatter(logging.Formatter('%(asctime)s | %(levelname)s | %(message)s')) +handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")) logger.addHandler(handler) def from_bytes( - sequences: bytes, - steps: int = 5, - chunk_size: int = 512, - threshold: float = 0.2, - cp_isolation: List[str] = None, - cp_exclusion: List[str] = None, - preemptive_behaviour: bool = True, - explain: bool = False + sequences: bytes, + steps: int = 5, + chunk_size: int = 512, + threshold: float = 0.2, + cp_isolation: List[str] = None, + cp_exclusion: List[str] = None, + preemptive_behaviour: bool = True, + explain: bool = False, ) -> CharsetMatches: """ Given a raw bytes sequence, return the best possibles charset usable to render str objects. @@ -49,6 +59,13 @@ def from_bytes( This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32. """ + if not isinstance(sequences, (bytearray, bytes)): + raise TypeError( + "Expected object of type bytes or bytearray, got: {0}".format( + type(sequences) + ) + ) + if not explain: logger.setLevel(logging.CRITICAL) else: @@ -57,41 +74,38 @@ def from_bytes( length = len(sequences) # type: int if length == 0: - logger.warning("Given content is empty, stopping the process very early, returning empty utf_8 str match") - return CharsetMatches( - [ - CharsetMatch( - sequences, - "utf_8", - 0., - False, - [], - "" - ) - ] + logger.warning( + "Given content is empty, stopping the process very early, returning empty utf_8 str match" ) + return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")]) if cp_isolation is not None: - logger.warning('cp_isolation is set. use this flag for debugging purpose. ' - 'limited list of encoding allowed : %s.', - ', '.join(cp_isolation)) + logger.warning( + "cp_isolation is set. use this flag for debugging purpose. " + "limited list of encoding allowed : %s.", + ", ".join(cp_isolation), + ) cp_isolation = [iana_name(cp, False) for cp in cp_isolation] else: cp_isolation = [] if cp_exclusion is not None: logger.warning( - 'cp_exclusion is set. use this flag for debugging purpose. ' - 'limited list of encoding excluded : %s.', - ', '.join(cp_exclusion)) + "cp_exclusion is set. use this flag for debugging purpose. " + "limited list of encoding excluded : %s.", + ", ".join(cp_exclusion), + ) cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion] else: cp_exclusion = [] if length <= (chunk_size * steps): logger.warning( - 'override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.', - steps, chunk_size, length) + "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.", + steps, + chunk_size, + length, + ) steps = 1 chunk_size = length @@ -102,15 +116,30 @@ def from_bytes( is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE # type: bool if is_too_small_sequence: - warn('Trying to detect encoding from a tiny portion of ({}) byte(s).'.format(length)) + logger.warning( + "Trying to detect encoding from a tiny portion of ({}) byte(s).".format( + length + ) + ) + elif is_too_large_sequence: + logger.info( + "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format( + length + ) + ) prioritized_encodings = [] # type: List[str] - specified_encoding = any_specified_encoding(sequences) if preemptive_behaviour is True else None # type: Optional[str] + specified_encoding = ( + any_specified_encoding(sequences) if preemptive_behaviour is True else None + ) # type: Optional[str] if specified_encoding is not None: prioritized_encodings.append(specified_encoding) - logger.info('Detected declarative mark in sequence. Priority +1 given for %s.', specified_encoding) + logger.info( + "Detected declarative mark in sequence. Priority +1 given for %s.", + specified_encoding, + ) tested = set() # type: Set[str] tested_but_hard_failure = [] # type: List[str] @@ -118,9 +147,7 @@ def from_bytes( fallback_ascii = None # type: Optional[CharsetMatch] fallback_u8 = None # type: Optional[CharsetMatch] - - single_byte_hard_failure_count = 0 # type: int - single_byte_soft_failure_count = 0 # type: int + fallback_specified = None # type: Optional[CharsetMatch] results = CharsetMatches() # type: CharsetMatches @@ -128,14 +155,18 @@ def from_bytes( if sig_encoding is not None: prioritized_encodings.append(sig_encoding) - logger.info('Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.', len(sig_payload), sig_encoding) + logger.info( + "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.", + len(sig_payload), + sig_encoding, + ) prioritized_encodings.append("ascii") if "utf_8" not in prioritized_encodings: prioritized_encodings.append("utf_8") - for encoding_iana in prioritized_encodings+IANA_SUPPORTED: + for encoding_iana in prioritized_encodings + IANA_SUPPORTED: if cp_isolation and encoding_iana not in cp_isolation: continue @@ -150,39 +181,48 @@ def from_bytes( decoded_payload = None # type: Optional[str] bom_or_sig_available = sig_encoding == encoding_iana # type: bool - strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(encoding_iana) # type: bool + strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom( + encoding_iana + ) # type: bool if encoding_iana in {"utf_16", "utf_32"} and bom_or_sig_available is False: - logger.info("Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.", encoding_iana) + logger.info( + "Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.", + encoding_iana, + ) continue try: is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana) # type: bool except (ModuleNotFoundError, ImportError): - logger.debug("Encoding %s does not provide an IncrementalDecoder", encoding_iana) + logger.debug( + "Encoding %s does not provide an IncrementalDecoder", encoding_iana + ) continue try: if is_too_large_sequence and is_multi_byte_decoder is False: str( - sequences[:int(50e4)] if strip_sig_or_bom is False else sequences[len(sig_payload):int(50e4)], - encoding=encoding_iana + sequences[: int(50e4)] + if strip_sig_or_bom is False + else sequences[len(sig_payload) : int(50e4)], + encoding=encoding_iana, ) else: decoded_payload = str( - sequences if strip_sig_or_bom is False else sequences[len(sig_payload):], - encoding=encoding_iana + sequences + if strip_sig_or_bom is False + else sequences[len(sig_payload) :], + encoding=encoding_iana, + ) + except (UnicodeDecodeError, LookupError) as e: + if not isinstance(e, LookupError): + logger.warning( + "Code page %s does not fit given bytes sequence at ALL. %s", + encoding_iana, + str(e), ) - except UnicodeDecodeError as e: - logger.warning('Code page %s does not fit given bytes sequence at ALL. %s', encoding_iana, str(e)) tested_but_hard_failure.append(encoding_iana) - if not is_multi_byte_decoder: - single_byte_hard_failure_count += 1 - continue - except LookupError: - tested_but_hard_failure.append(encoding_iana) - if not is_multi_byte_decoder: - single_byte_hard_failure_count += 1 continue similar_soft_failure_test = False # type: bool @@ -193,19 +233,31 @@ def from_bytes( break if similar_soft_failure_test: - logger.warning("%s is deemed too similar to code page %s and was consider unsuited already. Continuing!", encoding_iana, encoding_soft_failed) + logger.warning( + "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!", + encoding_iana, + encoding_soft_failed, + ) continue r_ = range( 0 if bom_or_sig_available is False else len(sig_payload), length, - int(length / steps) + int(length / steps), ) - multi_byte_bonus = is_multi_byte_decoder and decoded_payload is not None and len(decoded_payload) < length # type: bool + multi_byte_bonus = ( + is_multi_byte_decoder + and decoded_payload is not None + and len(decoded_payload) < length + ) # type: bool if multi_byte_bonus: - logger.info('Code page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes. Should not be a coincidence. Priority +1 given.', encoding_iana) + logger.info( + "Code page %s is a multi byte encoding table and it appear that at least one character " + "was encoded using n-bytes.", + encoding_iana, + ) max_chunk_gave_up = int(len(r_) / 4) # type: int @@ -218,62 +270,79 @@ def from_bytes( md_ratios = [] for i in r_: - cut_sequence = sequences[i:i + chunk_size] + cut_sequence = sequences[i : i + chunk_size] if bom_or_sig_available and strip_sig_or_bom is False: - cut_sequence = sig_payload+cut_sequence + cut_sequence = sig_payload + cut_sequence chunk = cut_sequence.decode(encoding_iana, errors="ignore") # type: str + # multi-byte bad cutting detector and adjustment + # not the cleanest way to perform that fix but clever enough for now. + if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80: + + chunk_partial_size_chk = ( + 16 if chunk_size > 16 else chunk_size + ) # type: int + + if ( + decoded_payload + and chunk[:chunk_partial_size_chk] not in decoded_payload + ): + for j in range(i, i - 4, -1): + cut_sequence = sequences[j : i + chunk_size] + + if bom_or_sig_available and strip_sig_or_bom is False: + cut_sequence = sig_payload + cut_sequence + + chunk = cut_sequence.decode(encoding_iana, errors="ignore") + + if chunk[:chunk_partial_size_chk] in decoded_payload: + break + md_chunks.append(chunk) - md_ratios.append( - mess_ratio( - chunk, - threshold - ) - ) + md_ratios.append(mess_ratio(chunk, threshold)) if md_ratios[-1] >= threshold: early_stop_count += 1 - if (early_stop_count >= max_chunk_gave_up) or (bom_or_sig_available and strip_sig_or_bom is False): + if (early_stop_count >= max_chunk_gave_up) or ( + bom_or_sig_available and strip_sig_or_bom is False + ): break if md_ratios: mean_mess_ratio = sum(md_ratios) / len(md_ratios) # type: float else: - mean_mess_ratio = 0. + mean_mess_ratio = 0.0 if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up: tested_but_soft_failure.append(encoding_iana) - if not is_multi_byte_decoder: - single_byte_soft_failure_count += 1 - logger.warning('%s was excluded because of initial chaos probing. Gave up %i time(s). ' - 'Computed mean chaos is %f %%.', - encoding_iana, - early_stop_count, - round(mean_mess_ratio * 100, ndigits=3)) + logger.warning( + "%s was excluded because of initial chaos probing. Gave up %i time(s). " + "Computed mean chaos is %f %%.", + encoding_iana, + early_stop_count, + round(mean_mess_ratio * 100, ndigits=3), + ) # Preparing those fallbacks in case we got nothing. - if encoding_iana in ["ascii", "utf_8"]: + if encoding_iana in ["ascii", "utf_8", specified_encoding]: fallback_entry = CharsetMatch( - sequences, - encoding_iana, - threshold, - False, - [], - decoded_payload + sequences, encoding_iana, threshold, False, [], decoded_payload ) - if encoding_iana == "ascii": + if encoding_iana == specified_encoding: + fallback_specified = fallback_entry + elif encoding_iana == "ascii": fallback_ascii = fallback_entry else: fallback_u8 = fallback_entry continue logger.info( - '%s passed initial chaos probing. Mean measured chaos is %f %%', + "%s passed initial chaos probing. Mean measured chaos is %f %%", encoding_iana, - round(mean_mess_ratio * 100, ndigits=3) + round(mean_mess_ratio * 100, ndigits=3), ) if not is_multi_byte_decoder: @@ -282,21 +351,29 @@ def from_bytes( target_languages = mb_encoding_languages(encoding_iana) if target_languages: - logger.info("{} should target any language(s) of {}".format(encoding_iana, str(target_languages))) + logger.info( + "{} should target any language(s) of {}".format( + encoding_iana, str(target_languages) + ) + ) cd_ratios = [] for chunk in md_chunks: - chunk_languages = coherence_ratio(chunk, 0.1, ",".join(target_languages) if target_languages else None) - - cd_ratios.append( - chunk_languages + chunk_languages = coherence_ratio( + chunk, 0.1, ",".join(target_languages) if target_languages else None ) + cd_ratios.append(chunk_languages) + cd_ratios_merged = merge_coherence_ratios(cd_ratios) if cd_ratios_merged: - logger.info("We detected language {} using {}".format(cd_ratios_merged, encoding_iana)) + logger.info( + "We detected language {} using {}".format( + cd_ratios_merged, encoding_iana + ) + ) results.append( CharsetMatch( @@ -305,37 +382,46 @@ def from_bytes( mean_mess_ratio, bom_or_sig_available, cd_ratios_merged, - decoded_payload + decoded_payload, ) ) - if encoding_iana in [specified_encoding, "ascii", "utf_8"] and mean_mess_ratio < 0.1: - logger.info("%s is most likely the one. Stopping the process.", encoding_iana) - return CharsetMatches( - [results[encoding_iana]] + if ( + encoding_iana in [specified_encoding, "ascii", "utf_8"] + and mean_mess_ratio < 0.1 + ): + logger.info( + "%s is most likely the one. Stopping the process.", encoding_iana ) + return CharsetMatches([results[encoding_iana]]) if encoding_iana == sig_encoding: logger.info( "%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.", - encoding_iana - ) - return CharsetMatches( - [results[encoding_iana]] - ) - - if results[-1].languages: - logger.info( - "Using %s code page we detected the following languages: %s", encoding_iana, - results[-1]._languages ) + return CharsetMatches([results[encoding_iana]]) if len(results) == 0: - if fallback_u8 or fallback_ascii: - logger.warning("Nothing got out of the detection process. Using ASCII/UTF-8 fallback.") + if fallback_u8 or fallback_ascii or fallback_specified: + logger.warning( + "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback." + ) - if (fallback_u8 and fallback_ascii is None) or (fallback_u8 and fallback_u8.fingerprint != fallback_ascii.fingerprint): + if fallback_specified: + logger.warning( + "%s will be used as a fallback match", fallback_specified.encoding + ) + results.append(fallback_specified) + elif ( + (fallback_u8 and fallback_ascii is None) + or ( + fallback_u8 + and fallback_ascii + and fallback_u8.fingerprint != fallback_ascii.fingerprint + ) + or (fallback_u8 is not None) + ): logger.warning("utf_8 will be used as a fallback match") results.append(fallback_u8) elif fallback_ascii: @@ -346,14 +432,14 @@ def from_bytes( def from_fp( - fp: BinaryIO, - steps: int = 5, - chunk_size: int = 512, - threshold: float = 0.20, - cp_isolation: List[str] = None, - cp_exclusion: List[str] = None, - preemptive_behaviour: bool = True, - explain: bool = False + fp: BinaryIO, + steps: int = 5, + chunk_size: int = 512, + threshold: float = 0.20, + cp_isolation: List[str] = None, + cp_exclusion: List[str] = None, + preemptive_behaviour: bool = True, + explain: bool = False, ) -> CharsetMatches: """ Same thing than the function from_bytes but using a file pointer that is already ready. @@ -367,29 +453,46 @@ def from_fp( cp_isolation, cp_exclusion, preemptive_behaviour, - explain + explain, ) def from_path( - path: PathLike, - steps: int = 5, - chunk_size: int = 512, - threshold: float = 0.20, - cp_isolation: List[str] = None, - cp_exclusion: List[str] = None, - preemptive_behaviour: bool = True, - explain: bool = False + path: PathLike, + steps: int = 5, + chunk_size: int = 512, + threshold: float = 0.20, + cp_isolation: List[str] = None, + cp_exclusion: List[str] = None, + preemptive_behaviour: bool = True, + explain: bool = False, ) -> CharsetMatches: """ Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode. Can raise IOError. """ - with open(path, 'rb') as fp: - return from_fp(fp, steps, chunk_size, threshold, cp_isolation, cp_exclusion, preemptive_behaviour, explain) + with open(path, "rb") as fp: + return from_fp( + fp, + steps, + chunk_size, + threshold, + cp_isolation, + cp_exclusion, + preemptive_behaviour, + explain, + ) -def normalize(path: PathLike, steps: int = 5, chunk_size: int = 512, threshold: float = 0.20, cp_isolation: List[str] = None, cp_exclusion: List[str] = None, preemptive_behaviour: bool = True) -> CharsetMatch: +def normalize( + path: PathLike, + steps: int = 5, + chunk_size: int = 512, + threshold: float = 0.20, + cp_isolation: List[str] = None, + cp_exclusion: List[str] = None, + preemptive_behaviour: bool = True, +) -> CharsetMatch: """ Take a (text-based) file path and try to create another file next to it, this time using UTF-8. """ @@ -400,22 +503,26 @@ def normalize(path: PathLike, steps: int = 5, chunk_size: int = 512, threshold: threshold, cp_isolation, cp_exclusion, - preemptive_behaviour + preemptive_behaviour, ) filename = basename(path) target_extensions = list(splitext(filename)) if len(results) == 0: - raise IOError('Unable to normalize "{}", no encoding charset seems to fit.'.format(filename)) + raise IOError( + 'Unable to normalize "{}", no encoding charset seems to fit.'.format( + filename + ) + ) result = results.best() - target_extensions[0] += '-' + result.encoding # type: ignore + target_extensions[0] += "-" + result.encoding # type: ignore - with open('{}'.format(path.replace(filename, ''.join(target_extensions))), 'wb') as fp: - fp.write( - result.output() # type: ignore - ) + with open( + "{}".format(str(path).replace(filename, "".join(target_extensions))), "wb" + ) as fp: + fp.write(result.output()) # type: ignore return result # type: ignore diff --git a/pipenv/vendor/charset_normalizer/assets/__init__.py b/pipenv/vendor/charset_normalizer/assets/__init__.py index 2d937736..b2e56ff3 100644 --- a/pipenv/vendor/charset_normalizer/assets/__init__.py +++ b/pipenv/vendor/charset_normalizer/assets/__init__.py @@ -1,52 +1,1244 @@ -""" -This submodule purpose is to load attached JSON asset. -Will be loaded once per package import / python init. - -The file 'frequencies.json' is mandatory for language/coherence detection. Not having it will weaker considerably -the core detection. -""" +# -*- coding: utf_8 -*- from collections import OrderedDict - FREQUENCIES = OrderedDict( [ - ('English', ['e', 'a', 't', 'i', 'o', 'n', 's', 'r', 'h', 'l', 'd', 'c', 'u', 'm', 'f', 'p', 'g', 'w', 'y', 'b', 'v', 'k', 'x', 'j', 'z', 'q']), - ('German', ['e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u', 'l', 'g', 'o', 'c', 'm', 'b', 'f', 'k', 'w', 'z', 'p', 'v', 'ü', 'ä', 'ö', 'j']), - ('French', ['e', 'a', 's', 'n', 'i', 't', 'r', 'l', 'u', 'o', 'd', 'c', 'p', 'm', 'é', 'v', 'g', 'f', 'b', 'h', 'q', 'à', 'x', 'è', 'y', 'j']), - ('Dutch', ['e', 'n', 'a', 'i', 'r', 't', 'o', 'd', 's', 'l', 'g', 'h', 'v', 'm', 'u', 'k', 'c', 'p', 'b', 'w', 'j', 'z', 'f', 'y', 'x', 'ë']), - ('Italian', ['e', 'i', 'a', 'o', 'n', 'l', 't', 'r', 's', 'c', 'd', 'u', 'p', 'm', 'g', 'v', 'f', 'b', 'z', 'h', 'q', 'è', 'à', 'k', 'y', 'ò']), - ('Polish', ['a', 'i', 'o', 'e', 'n', 'r', 'z', 'w', 's', 'c', 't', 'k', 'y', 'd', 'p', 'm', 'u', 'l', 'j', 'ł', 'g', 'b', 'h', 'ą', 'ę', 'ó']), - ('Spanish', ['e', 'a', 'o', 'n', 's', 'r', 'i', 'l', 'd', 't', 'c', 'u', 'm', 'p', 'b', 'g', 'v', 'f', 'y', 'ó', 'h', 'q', 'í', 'j', 'z', 'á']), - ('Russian', ['о', 'а', 'е', 'и', 'н', 'с', 'т', 'р', 'в', 'л', 'к', 'м', 'д', 'п', 'у', 'г', 'я', 'ы', 'з', 'б', 'й', 'ь', 'ч', 'х', 'ж', 'ц']), - ('Japanese', ['の', 'に', 'る', 'た', 'は', 'ー', 'と', 'し', 'を', 'で', 'て', 'が', 'い', 'ン', 'れ', 'な', '年', 'ス', 'っ', 'ル', 'か', 'ら', 'あ', 'さ', 'も', 'り']), - ('Portuguese', ['a', 'e', 'o', 's', 'i', 'r', 'd', 'n', 't', 'm', 'u', 'c', 'l', 'p', 'g', 'v', 'b', 'f', 'h', 'ã', 'q', 'é', 'ç', 'á', 'z', 'í']), - ('Swedish', ['e', 'a', 'n', 'r', 't', 's', 'i', 'l', 'd', 'o', 'm', 'k', 'g', 'v', 'h', 'f', 'u', 'p', 'ä', 'c', 'b', 'ö', 'å', 'y', 'j', 'x']), - ('Chinese', ['的', '一', '是', '不', '了', '在', '人', '有', '我', '他', '这', '个', '们', '中', '来', '上', '大', '为', '和', '国', '地', '到', '以', '说', '时', '要', '就', '出', '会']), - ('Ukrainian', ['о', 'а', 'н', 'і', 'и', 'р', 'в', 'т', 'е', 'с', 'к', 'л', 'у', 'д', 'м', 'п', 'з', 'я', 'ь', 'б', 'г', 'й', 'ч', 'х', 'ц', 'ї']), - ('Norwegian', ['e', 'r', 'n', 't', 'a', 's', 'i', 'o', 'l', 'd', 'g', 'k', 'm', 'v', 'f', 'p', 'u', 'b', 'h', 'å', 'y', 'j', 'ø', 'c', 'æ', 'w']), - ('Finnish', ['a', 'i', 'n', 't', 'e', 's', 'l', 'o', 'u', 'k', 'ä', 'm', 'r', 'v', 'j', 'h', 'p', 'y', 'd', 'ö', 'g', 'c', 'b', 'f', 'w', 'z']), - ('Vietnamese', ['n', 'h', 't', 'i', 'c', 'g', 'a', 'o', 'u', 'm', 'l', 'r', 'à', 'đ', 's', 'e', 'v', 'p', 'b', 'y', 'ư', 'd', 'á', 'k', 'ộ', 'ế']), - ('Czech', ['o', 'e', 'a', 'n', 't', 's', 'i', 'l', 'v', 'r', 'k', 'd', 'u', 'm', 'p', 'í', 'c', 'h', 'z', 'á', 'y', 'j', 'b', 'ě', 'é', 'ř']), - ('Hungarian', ['e', 'a', 't', 'l', 's', 'n', 'k', 'r', 'i', 'o', 'z', 'á', 'é', 'g', 'm', 'b', 'y', 'v', 'd', 'h', 'u', 'p', 'j', 'ö', 'f', 'c']), - ('Korean', ['이', '다', '에', '의', '는', '로', '하', '을', '가', '고', '지', '서', '한', '은', '기', '으', '년', '대', '사', '시', '를', '리', '도', '인', '스', '일']), - ('Indonesian', ['a', 'n', 'e', 'i', 'r', 't', 'u', 's', 'd', 'k', 'm', 'l', 'g', 'p', 'b', 'o', 'h', 'y', 'j', 'c', 'w', 'f', 'v', 'z', 'x', 'q']), - ('Turkish', ['a', 'e', 'i', 'n', 'r', 'l', 'ı', 'k', 'd', 't', 's', 'm', 'y', 'u', 'o', 'b', 'ü', 'ş', 'v', 'g', 'z', 'h', 'c', 'p', 'ç', 'ğ']), - ('Romanian', ['e', 'i', 'a', 'r', 'n', 't', 'u', 'l', 'o', 'c', 's', 'd', 'p', 'm', 'ă', 'f', 'v', 'î', 'g', 'b', 'ș', 'ț', 'z', 'h', 'â', 'j']), - ('Farsi', ['ا', 'ی', 'ر', 'د', 'ن', 'ه', 'و', 'م', 'ت', 'ب', 'س', 'ل', 'ک', 'ش', 'ز', 'ف', 'گ', 'ع', 'خ', 'ق', 'ج', 'آ', 'پ', 'ح', 'ط', 'ص']), - ('Arabic', ['ا', 'ل', 'ي', 'م', 'و', 'ن', 'ر', 'ت', 'ب', 'ة', 'ع', 'د', 'س', 'ف', 'ه', 'ك', 'ق', 'أ', 'ح', 'ج', 'ش', 'ط', 'ص', 'ى', 'خ', 'إ']), - ('Danish', ['e', 'r', 'n', 't', 'a', 'i', 's', 'd', 'l', 'o', 'g', 'm', 'k', 'f', 'v', 'u', 'b', 'h', 'p', 'å', 'y', 'ø', 'æ', 'c', 'j', 'w']), - ('Serbian', ['а', 'и', 'о', 'е', 'н', 'р', 'с', 'у', 'т', 'к', 'ј', 'в', 'д', 'м', 'п', 'л', 'г', 'з', 'б', 'a', 'i', 'e', 'o', 'n', 'ц', 'ш']), - ('Lithuanian', ['i', 'a', 's', 'o', 'r', 'e', 't', 'n', 'u', 'k', 'm', 'l', 'p', 'v', 'd', 'j', 'g', 'ė', 'b', 'y', 'ų', 'š', 'ž', 'c', 'ą', 'į']), - ('Slovene', ['e', 'a', 'i', 'o', 'n', 'r', 's', 'l', 't', 'j', 'v', 'k', 'd', 'p', 'm', 'u', 'z', 'b', 'g', 'h', 'č', 'c', 'š', 'ž', 'f', 'y']), - ('Slovak', ['o', 'a', 'e', 'n', 'i', 'r', 'v', 't', 's', 'l', 'k', 'd', 'm', 'p', 'u', 'c', 'h', 'j', 'b', 'z', 'á', 'y', 'ý', 'í', 'č', 'é']), - ('Hebrew', ['י', 'ו', 'ה', 'ל', 'ר', 'ב', 'ת', 'מ', 'א', 'ש', 'נ', 'ע', 'ם', 'ד', 'ק', 'ח', 'פ', 'ס', 'כ', 'ג', 'ט', 'צ', 'ן', 'ז', 'ך']), - ('Bulgarian', ['а', 'и', 'о', 'е', 'н', 'т', 'р', 'с', 'в', 'л', 'к', 'д', 'п', 'м', 'з', 'г', 'я', 'ъ', 'у', 'б', 'ч', 'ц', 'й', 'ж', 'щ', 'х']), - ('Croatian', ['a', 'i', 'o', 'e', 'n', 'r', 'j', 's', 't', 'u', 'k', 'l', 'v', 'd', 'm', 'p', 'g', 'z', 'b', 'c', 'č', 'h', 'š', 'ž', 'ć', 'f']), - ('Hindi', ['क', 'र', 'स', 'न', 'त', 'म', 'ह', 'प', 'य', 'ल', 'व', 'ज', 'द', 'ग', 'ब', 'श', 'ट', 'अ', 'ए', 'थ', 'भ', 'ड', 'च', 'ध', 'ष', 'इ']), - ('Estonian', ['a', 'i', 'e', 's', 't', 'l', 'u', 'n', 'o', 'k', 'r', 'd', 'm', 'v', 'g', 'p', 'j', 'h', 'ä', 'b', 'õ', 'ü', 'f', 'c', 'ö', 'y']), - ('Simple English', ['e', 'a', 't', 'i', 'o', 'n', 's', 'r', 'h', 'l', 'd', 'c', 'm', 'u', 'f', 'p', 'g', 'w', 'b', 'y', 'v', 'k', 'j', 'x', 'z', 'q']), - ('Thai', ['า', 'น', 'ร', 'อ', 'ก', 'เ', 'ง', 'ม', 'ย', 'ล', 'ว', 'ด', 'ท', 'ส', 'ต', 'ะ', 'ป', 'บ', 'ค', 'ห', 'แ', 'จ', 'พ', 'ช', 'ข', 'ใ']), - ('Greek', ['α', 'τ', 'ο', 'ι', 'ε', 'ν', 'ρ', 'σ', 'κ', 'η', 'π', 'ς', 'υ', 'μ', 'λ', 'ί', 'ό', 'ά', 'γ', 'έ', 'δ', 'ή', 'ω', 'χ', 'θ', 'ύ']), - ('Tamil', ['க', 'த', 'ப', 'ட', 'ர', 'ம', 'ல', 'ன', 'வ', 'ற', 'ய', 'ள', 'ச', 'ந', 'இ', 'ண', 'அ', 'ஆ', 'ழ', 'ங', 'எ', 'உ', 'ஒ', 'ஸ']), - ('Classical Chinese', ['之', '年', '為', '也', '以', '一', '人', '其', '者', '國', '有', '二', '十', '於', '曰', '三', '不', '大', '而', '子', '中', '五', '四'])] + ( + "English", + [ + "e", + "a", + "t", + "i", + "o", + "n", + "s", + "r", + "h", + "l", + "d", + "c", + "u", + "m", + "f", + "p", + "g", + "w", + "y", + "b", + "v", + "k", + "x", + "j", + "z", + "q", + ], + ), + ( + "German", + [ + "e", + "n", + "i", + "r", + "s", + "t", + "a", + "d", + "h", + "u", + "l", + "g", + "o", + "c", + "m", + "b", + "f", + "k", + "w", + "z", + "p", + "v", + "ü", + "ä", + "ö", + "j", + ], + ), + ( + "French", + [ + "e", + "a", + "s", + "n", + "i", + "t", + "r", + "l", + "u", + "o", + "d", + "c", + "p", + "m", + "é", + "v", + "g", + "f", + "b", + "h", + "q", + "à", + "x", + "è", + "y", + "j", + ], + ), + ( + "Dutch", + [ + "e", + "n", + "a", + "i", + "r", + "t", + "o", + "d", + "s", + "l", + "g", + "h", + "v", + "m", + "u", + "k", + "c", + "p", + "b", + "w", + "j", + "z", + "f", + "y", + "x", + "ë", + ], + ), + ( + "Italian", + [ + "e", + "i", + "a", + "o", + "n", + "l", + "t", + "r", + "s", + "c", + "d", + "u", + "p", + "m", + "g", + "v", + "f", + "b", + "z", + "h", + "q", + "è", + "à", + "k", + "y", + "ò", + ], + ), + ( + "Polish", + [ + "a", + "i", + "o", + "e", + "n", + "r", + "z", + "w", + "s", + "c", + "t", + "k", + "y", + "d", + "p", + "m", + "u", + "l", + "j", + "ł", + "g", + "b", + "h", + "ą", + "ę", + "ó", + ], + ), + ( + "Spanish", + [ + "e", + "a", + "o", + "n", + "s", + "r", + "i", + "l", + "d", + "t", + "c", + "u", + "m", + "p", + "b", + "g", + "v", + "f", + "y", + "ó", + "h", + "q", + "í", + "j", + "z", + "á", + ], + ), + ( + "Russian", + [ + "о", + "а", + "е", + "и", + "н", + "с", + "т", + "р", + "в", + "л", + "к", + "м", + "д", + "п", + "у", + "г", + "я", + "ы", + "з", + "б", + "й", + "ь", + "ч", + "х", + "ж", + "ц", + ], + ), + ( + "Japanese", + [ + "の", + "に", + "る", + "た", + "は", + "ー", + "と", + "し", + "を", + "で", + "て", + "が", + "い", + "ン", + "れ", + "な", + "年", + "ス", + "っ", + "ル", + "か", + "ら", + "あ", + "さ", + "も", + "り", + ], + ), + ( + "Portuguese", + [ + "a", + "e", + "o", + "s", + "i", + "r", + "d", + "n", + "t", + "m", + "u", + "c", + "l", + "p", + "g", + "v", + "b", + "f", + "h", + "ã", + "q", + "é", + "ç", + "á", + "z", + "í", + ], + ), + ( + "Swedish", + [ + "e", + "a", + "n", + "r", + "t", + "s", + "i", + "l", + "d", + "o", + "m", + "k", + "g", + "v", + "h", + "f", + "u", + "p", + "ä", + "c", + "b", + "ö", + "å", + "y", + "j", + "x", + ], + ), + ( + "Chinese", + [ + "的", + "一", + "是", + "不", + "了", + "在", + "人", + "有", + "我", + "他", + "这", + "个", + "们", + "中", + "来", + "上", + "大", + "为", + "和", + "国", + "地", + "到", + "以", + "说", + "时", + "要", + "就", + "出", + "会", + ], + ), + ( + "Ukrainian", + [ + "о", + "а", + "н", + "і", + "и", + "р", + "в", + "т", + "е", + "с", + "к", + "л", + "у", + "д", + "м", + "п", + "з", + "я", + "ь", + "б", + "г", + "й", + "ч", + "х", + "ц", + "ї", + ], + ), + ( + "Norwegian", + [ + "e", + "r", + "n", + "t", + "a", + "s", + "i", + "o", + "l", + "d", + "g", + "k", + "m", + "v", + "f", + "p", + "u", + "b", + "h", + "å", + "y", + "j", + "ø", + "c", + "æ", + "w", + ], + ), + ( + "Finnish", + [ + "a", + "i", + "n", + "t", + "e", + "s", + "l", + "o", + "u", + "k", + "ä", + "m", + "r", + "v", + "j", + "h", + "p", + "y", + "d", + "ö", + "g", + "c", + "b", + "f", + "w", + "z", + ], + ), + ( + "Vietnamese", + [ + "n", + "h", + "t", + "i", + "c", + "g", + "a", + "o", + "u", + "m", + "l", + "r", + "à", + "đ", + "s", + "e", + "v", + "p", + "b", + "y", + "ư", + "d", + "á", + "k", + "ộ", + "ế", + ], + ), + ( + "Czech", + [ + "o", + "e", + "a", + "n", + "t", + "s", + "i", + "l", + "v", + "r", + "k", + "d", + "u", + "m", + "p", + "í", + "c", + "h", + "z", + "á", + "y", + "j", + "b", + "ě", + "é", + "ř", + ], + ), + ( + "Hungarian", + [ + "e", + "a", + "t", + "l", + "s", + "n", + "k", + "r", + "i", + "o", + "z", + "á", + "é", + "g", + "m", + "b", + "y", + "v", + "d", + "h", + "u", + "p", + "j", + "ö", + "f", + "c", + ], + ), + ( + "Korean", + [ + "이", + "다", + "에", + "의", + "는", + "로", + "하", + "을", + "가", + "고", + "지", + "서", + "한", + "은", + "기", + "으", + "년", + "대", + "사", + "시", + "를", + "리", + "도", + "인", + "스", + "일", + ], + ), + ( + "Indonesian", + [ + "a", + "n", + "e", + "i", + "r", + "t", + "u", + "s", + "d", + "k", + "m", + "l", + "g", + "p", + "b", + "o", + "h", + "y", + "j", + "c", + "w", + "f", + "v", + "z", + "x", + "q", + ], + ), + ( + "Turkish", + [ + "a", + "e", + "i", + "n", + "r", + "l", + "ı", + "k", + "d", + "t", + "s", + "m", + "y", + "u", + "o", + "b", + "ü", + "ş", + "v", + "g", + "z", + "h", + "c", + "p", + "ç", + "ğ", + ], + ), + ( + "Romanian", + [ + "e", + "i", + "a", + "r", + "n", + "t", + "u", + "l", + "o", + "c", + "s", + "d", + "p", + "m", + "ă", + "f", + "v", + "î", + "g", + "b", + "ș", + "ț", + "z", + "h", + "â", + "j", + ], + ), + ( + "Farsi", + [ + "ا", + "ی", + "ر", + "د", + "ن", + "ه", + "و", + "م", + "ت", + "ب", + "س", + "ل", + "ک", + "ش", + "ز", + "ف", + "گ", + "ع", + "خ", + "ق", + "ج", + "آ", + "پ", + "ح", + "ط", + "ص", + ], + ), + ( + "Arabic", + [ + "ا", + "ل", + "ي", + "م", + "و", + "ن", + "ر", + "ت", + "ب", + "ة", + "ع", + "د", + "س", + "ف", + "ه", + "ك", + "ق", + "أ", + "ح", + "ج", + "ش", + "ط", + "ص", + "ى", + "خ", + "إ", + ], + ), + ( + "Danish", + [ + "e", + "r", + "n", + "t", + "a", + "i", + "s", + "d", + "l", + "o", + "g", + "m", + "k", + "f", + "v", + "u", + "b", + "h", + "p", + "å", + "y", + "ø", + "æ", + "c", + "j", + "w", + ], + ), + ( + "Serbian", + [ + "а", + "и", + "о", + "е", + "н", + "р", + "с", + "у", + "т", + "к", + "ј", + "в", + "д", + "м", + "п", + "л", + "г", + "з", + "б", + "a", + "i", + "e", + "o", + "n", + "ц", + "ш", + ], + ), + ( + "Lithuanian", + [ + "i", + "a", + "s", + "o", + "r", + "e", + "t", + "n", + "u", + "k", + "m", + "l", + "p", + "v", + "d", + "j", + "g", + "ė", + "b", + "y", + "ų", + "š", + "ž", + "c", + "ą", + "į", + ], + ), + ( + "Slovene", + [ + "e", + "a", + "i", + "o", + "n", + "r", + "s", + "l", + "t", + "j", + "v", + "k", + "d", + "p", + "m", + "u", + "z", + "b", + "g", + "h", + "č", + "c", + "š", + "ž", + "f", + "y", + ], + ), + ( + "Slovak", + [ + "o", + "a", + "e", + "n", + "i", + "r", + "v", + "t", + "s", + "l", + "k", + "d", + "m", + "p", + "u", + "c", + "h", + "j", + "b", + "z", + "á", + "y", + "ý", + "í", + "č", + "é", + ], + ), + ( + "Hebrew", + [ + "י", + "ו", + "ה", + "ל", + "ר", + "ב", + "ת", + "מ", + "א", + "ש", + "נ", + "ע", + "ם", + "ד", + "ק", + "ח", + "פ", + "ס", + "כ", + "ג", + "ט", + "צ", + "ן", + "ז", + "ך", + ], + ), + ( + "Bulgarian", + [ + "а", + "и", + "о", + "е", + "н", + "т", + "р", + "с", + "в", + "л", + "к", + "д", + "п", + "м", + "з", + "г", + "я", + "ъ", + "у", + "б", + "ч", + "ц", + "й", + "ж", + "щ", + "х", + ], + ), + ( + "Croatian", + [ + "a", + "i", + "o", + "e", + "n", + "r", + "j", + "s", + "t", + "u", + "k", + "l", + "v", + "d", + "m", + "p", + "g", + "z", + "b", + "c", + "č", + "h", + "š", + "ž", + "ć", + "f", + ], + ), + ( + "Hindi", + [ + "क", + "र", + "स", + "न", + "त", + "म", + "ह", + "प", + "य", + "ल", + "व", + "ज", + "द", + "ग", + "ब", + "श", + "ट", + "अ", + "ए", + "थ", + "भ", + "ड", + "च", + "ध", + "ष", + "इ", + ], + ), + ( + "Estonian", + [ + "a", + "i", + "e", + "s", + "t", + "l", + "u", + "n", + "o", + "k", + "r", + "d", + "m", + "v", + "g", + "p", + "j", + "h", + "ä", + "b", + "õ", + "ü", + "f", + "c", + "ö", + "y", + ], + ), + ( + "Simple English", + [ + "e", + "a", + "t", + "i", + "o", + "n", + "s", + "r", + "h", + "l", + "d", + "c", + "m", + "u", + "f", + "p", + "g", + "w", + "b", + "y", + "v", + "k", + "j", + "x", + "z", + "q", + ], + ), + ( + "Thai", + [ + "า", + "น", + "ร", + "อ", + "ก", + "เ", + "ง", + "ม", + "ย", + "ล", + "ว", + "ด", + "ท", + "ส", + "ต", + "ะ", + "ป", + "บ", + "ค", + "ห", + "แ", + "จ", + "พ", + "ช", + "ข", + "ใ", + ], + ), + ( + "Greek", + [ + "α", + "τ", + "ο", + "ι", + "ε", + "ν", + "ρ", + "σ", + "κ", + "η", + "π", + "ς", + "υ", + "μ", + "λ", + "ί", + "ό", + "ά", + "γ", + "έ", + "δ", + "ή", + "ω", + "χ", + "θ", + "ύ", + ], + ), + ( + "Tamil", + [ + "க", + "த", + "ப", + "ட", + "ர", + "ம", + "ல", + "ன", + "வ", + "ற", + "ய", + "ள", + "ச", + "ந", + "இ", + "ண", + "அ", + "ஆ", + "ழ", + "ங", + "எ", + "உ", + "ஒ", + "ஸ", + ], + ), + ( + "Classical Chinese", + [ + "之", + "年", + "為", + "也", + "以", + "一", + "人", + "其", + "者", + "國", + "有", + "二", + "十", + "於", + "曰", + "三", + "不", + "大", + "而", + "子", + "中", + "五", + "四", + ], + ), + ( + "Kazakh", + [ + "а", + "ы", + "е", + "н", + "т", + "р", + "л", + "і", + "д", + "с", + "м", + "қ", + "к", + "о", + "б", + "и", + "у", + "ғ", + "ж", + "ң", + "з", + "ш", + "й", + "п", + "г", + "ө", + ], + ), + ] ) diff --git a/pipenv/vendor/charset_normalizer/cd.py b/pipenv/vendor/charset_normalizer/cd.py index a89022e2..a4512fbb 100644 --- a/pipenv/vendor/charset_normalizer/cd.py +++ b/pipenv/vendor/charset_normalizer/cd.py @@ -1,13 +1,20 @@ -from codecs import IncrementalDecoder -from functools import lru_cache -from typing import List, Set, Optional, Tuple, Dict import importlib +from codecs import IncrementalDecoder +from collections import Counter, OrderedDict +from functools import lru_cache +from typing import Dict, List, Optional, Tuple -from pipenv.vendor.charset_normalizer.models import CoherenceMatches -from pipenv.vendor.charset_normalizer.utils import unicode_range, is_unicode_range_secondary, is_multi_byte_encoding -from pipenv.vendor.charset_normalizer.md import is_suspiciously_successive_range -from pipenv.vendor.charset_normalizer.assets import FREQUENCIES -from collections import Counter +from .assets import FREQUENCIES +from .constant import KO_NAMES, TOO_SMALL_SEQUENCE, ZH_NAMES +from .md import is_suspiciously_successive_range +from .models import CoherenceMatches +from .utils import ( + is_accentuated, + is_latin, + is_multi_byte_encoding, + is_unicode_range_secondary, + unicode_range, +) def encoding_unicode_range(iana_name: str) -> List[str]: @@ -17,15 +24,14 @@ def encoding_unicode_range(iana_name: str) -> List[str]: if is_multi_byte_encoding(iana_name): raise IOError("Function not supported on multi-byte code page") - decoder = importlib.import_module('encodings.{}'.format(iana_name)).IncrementalDecoder # type: ignore + decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore p = decoder(errors="ignore") # type: IncrementalDecoder - seen_ranges = set() # type: Set[str] + seen_ranges = {} # type: Dict[str, int] + character_count = 0 # type: int - for i in range(48, 255): - chunk = p.decode( - bytes([i]) - ) # type: str + for i in range(0x40, 0xFF): + chunk = p.decode(bytes([i])) # type: str if chunk: character_range = unicode_range(chunk) # type: Optional[str] @@ -34,9 +40,18 @@ def encoding_unicode_range(iana_name: str) -> List[str]: continue if is_unicode_range_secondary(character_range) is False: - seen_ranges.add(character_range) + if character_range not in seen_ranges: + seen_ranges[character_range] = 0 + seen_ranges[character_range] += 1 + character_count += 1 - return sorted(list(seen_ranges)) + return sorted( + [ + character_range + for character_range in seen_ranges + if seen_ranges[character_range] / character_count >= 0.15 + ] + ) def unicode_range_languages(primary_range: str) -> List[str]: @@ -74,42 +89,78 @@ def encoding_languages(iana_name: str) -> List[str]: return unicode_range_languages(primary_range) +@lru_cache() def mb_encoding_languages(iana_name: str) -> List[str]: """ Multi-byte encoding language association. Some code page are heavily linked to particular language(s). This function does the correspondence. """ - if iana_name.startswith("shift_") or iana_name.startswith("iso2022_jp") or iana_name.startswith("euc_j") or iana_name in {"cp932"}: + if ( + iana_name.startswith("shift_") + or iana_name.startswith("iso2022_jp") + or iana_name.startswith("euc_j") + or iana_name == "cp932" + ): return ["Japanese"] - if iana_name.startswith("gb") or iana_name in {"big5", "cp950", "big5hkscs"}: + if iana_name.startswith("gb") or iana_name in ZH_NAMES: return ["Chinese", "Classical Chinese"] - if iana_name.startswith("iso2022_kr") or iana_name in {"johab", "cp949", "euc_kr"}: + if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES: return ["Korean"] return [] -def alphabet_languages(characters: List[str]) -> List[str]: +def alphabet_languages( + characters: List[str], ignore_non_latin: bool = False +) -> List[str]: """ Return associated languages associated to given characters. """ - languages = [] # type: List[str] + languages = [] # type: List[Tuple[str, float]] + + source_have_accents = False # type: bool + + for character in characters: + if is_accentuated(character): + source_have_accents = True + break for language, language_characters in FREQUENCIES.items(): - character_match_count = 0 # type: int + + target_have_accents = False # type: bool + target_pure_latin = True # type: bool + + for language_character in language_characters: + if target_have_accents is False and is_accentuated(language_character): + target_have_accents = True + if target_pure_latin is True and is_latin(language_character) is False: + target_pure_latin = False + + if ignore_non_latin and target_pure_latin is False: + continue + + if target_have_accents is False and source_have_accents: + continue + character_count = len(language_characters) # type: int - for character in language_characters: - if character in characters: - character_match_count += 1 + character_match_count = len( + [c for c in language_characters if c in characters] + ) # type: int - if character_match_count / character_count >= 0.2: - languages.append(language) + ratio = character_match_count / character_count # type: float - return languages + if ratio >= 0.2: + languages.append((language, ratio)) + + languages = sorted(languages, key=lambda x: x[1], reverse=True) + + return [compatible_language[0] for compatible_language in languages] -def characters_popularity_compare(language: str, ordered_characters: List[str]) -> float: +def characters_popularity_compare( + language: str, ordered_characters: List[str] +) -> float: """ Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language. The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit). @@ -124,14 +175,30 @@ def characters_popularity_compare(language: str, ordered_characters: List[str]) if character not in FREQUENCIES[language]: continue - characters_before_source = FREQUENCIES[language][0:FREQUENCIES[language].index(character)] # type: List[str] - characters_after_source = FREQUENCIES[language][FREQUENCIES[language].index(character):] # type: List[str] + characters_before_source = FREQUENCIES[language][ + 0 : FREQUENCIES[language].index(character) + ] # type: List[str] + characters_after_source = FREQUENCIES[language][ + FREQUENCIES[language].index(character) : + ] # type: List[str] - characters_before = ordered_characters[0:ordered_characters.index(character)] # type: List[str] - characters_after = ordered_characters[ordered_characters.index(character):] # type: List[str] + characters_before = ordered_characters[ + 0 : ordered_characters.index(character) + ] # type: List[str] + characters_after = ordered_characters[ + ordered_characters.index(character) : + ] # type: List[str] - before_match_count = [e in characters_before for e in characters_before_source].count(True) # type: int - after_match_count = [e in characters_after for e in characters_after_source].count(True) # type: int + before_match_count = [ + e in characters_before for e in characters_before_source + ].count( + True + ) # type: int + after_match_count = [ + e in characters_after for e in characters_after_source + ].count( + True + ) # type: int if len(characters_before_source) == 0 and before_match_count <= 4: character_approved_count += 1 @@ -141,7 +208,10 @@ def characters_popularity_compare(language: str, ordered_characters: List[str]) character_approved_count += 1 continue - if before_match_count / len(characters_before_source) >= 0.4 or after_match_count / len(characters_after_source) >= 0.4: + if ( + before_match_count / len(characters_before_source) >= 0.4 + or after_match_count / len(characters_after_source) >= 0.4 + ): character_approved_count += 1 continue @@ -154,18 +224,24 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]: Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list; One containing the latin letters and the other hebrew. """ - layers = {} # type: Dict[str, str] + layers = OrderedDict() # type: Dict[str, str] for character in decoded_sequence: if character.isalpha() is False: continue - character_range = unicode_range(character) # type: str + character_range = unicode_range(character) # type: Optional[str] + + if character_range is None: + continue layer_target_range = None # type: Optional[str] for discovered_range in layers: - if is_suspiciously_successive_range(discovered_range, character_range) is False: + if ( + is_suspiciously_successive_range(discovered_range, character_range) + is False + ): layer_target_range = discovered_range break @@ -186,7 +262,7 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches: This function merge results previously given by the function coherence_ratio. The return type is the same as coherence_ratio. """ - per_language_ratios = {} # type: Dict[str, List[float]] + per_language_ratios = OrderedDict() # type: Dict[str, List[float]] merge = [] # type: CoherenceMatches for result in results: @@ -195,20 +271,17 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches: if language not in per_language_ratios: per_language_ratios[language] = [ratio] continue - per_language_ratios[language].append( - ratio - ) + per_language_ratios[language].append(ratio) for language in per_language_ratios: merge.append( ( language, round( - sum( - per_language_ratios[language] - ) / len(per_language_ratios[language]), - 4 - ) + sum(per_language_ratios[language]) + / len(per_language_ratios[language]), + 4, + ), ) ) @@ -216,21 +289,26 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches: @lru_cache(maxsize=2048) -def coherence_ratio(decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None) -> CoherenceMatches: +def coherence_ratio( + decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None +) -> CoherenceMatches: """ Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers. A layer = Character extraction by alphabets/ranges. """ results = [] # type: List[Tuple[str, float]] + lg_inclusion_list = [] # type: List[str] + ignore_non_latin = False # type: bool sufficient_match_count = 0 # type: int if lg_inclusion is not None: - lg_inclusion = lg_inclusion.split(",") + lg_inclusion_list = lg_inclusion.split(",") - if lg_inclusion is not None and "Latin Based" in lg_inclusion: - lg_inclusion.remove("Latin Based") + if "Latin Based" in lg_inclusion_list: + ignore_non_latin = True + lg_inclusion_list.remove("Latin Based") for layer in alpha_unicode_split(decoded_sequence): sequence_frequencies = Counter(layer) # type: Counter @@ -238,22 +316,24 @@ def coherence_ratio(decoded_sequence: str, threshold: float = 0.1, lg_inclusion: character_count = sum([o for c, o in most_common]) # type: int - if character_count <= 32: + if character_count <= TOO_SMALL_SEQUENCE: continue popular_character_ordered = [c for c, o in most_common] # type: List[str] - for language in lg_inclusion or alphabet_languages(popular_character_ordered): - ratio = characters_popularity_compare(language, popular_character_ordered) # type: float + for language in lg_inclusion_list or alphabet_languages( + popular_character_ordered, ignore_non_latin + ): + ratio = characters_popularity_compare( + language, popular_character_ordered + ) # type: float if ratio < threshold: continue elif ratio >= 0.8: sufficient_match_count += 1 - results.append( - (language, round(ratio, 4)) - ) + results.append((language, round(ratio, 4))) if sufficient_match_count >= 3: break diff --git a/pipenv/vendor/charset_normalizer/cli/normalizer.py b/pipenv/vendor/charset_normalizer/cli/normalizer.py index 26c94b7c..f1911259 100644 --- a/pipenv/vendor/charset_normalizer/cli/normalizer.py +++ b/pipenv/vendor/charset_normalizer/cli/normalizer.py @@ -1,16 +1,16 @@ import argparse import sys -from os.path import abspath from json import dumps - -from pipenv.vendor.charset_normalizer import from_fp -from pipenv.vendor.charset_normalizer.models import CliDetectionResult -from pipenv.vendor.charset_normalizer.version import __version__ - +from os.path import abspath from platform import python_version +from typing import List + +from charset_normalizer import from_fp +from charset_normalizer.models import CliDetectionResult +from charset_normalizer.version import __version__ -def query_yes_no(question, default="yes"): +def query_yes_no(question: str, default: str = "yes") -> bool: """Ask a yes/no question via input() and return their answer. "question" is a string that is presented to the user. @@ -22,8 +22,7 @@ def query_yes_no(question, default="yes"): Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input """ - valid = {"yes": True, "y": True, "ye": True, - "no": False, "n": False} + valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False} if default is None: prompt = " [y/n] " elif default == "yes": @@ -36,16 +35,15 @@ def query_yes_no(question, default="yes"): while True: sys.stdout.write(question + prompt) choice = input().lower() - if default is not None and choice == '': + if default is not None and choice == "": return valid[default] elif choice in valid: return valid[choice] else: - sys.stdout.write("Please respond with 'yes' or 'no' " - "(or 'y' or 'n').\n") + sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n") -def cli_detect(argv=None): +def cli_detect(argv: List[str] = None) -> int: """ CLI assistant using ARGV and ArgumentParser :param argv: @@ -53,133 +51,215 @@ def cli_detect(argv=None): """ parser = argparse.ArgumentParser( description="The Real First Universal Charset Detector. " - "Discover originating encoding used on text file. " - "Normalize text to unicode." + "Discover originating encoding used on text file. " + "Normalize text to unicode." ) - parser.add_argument('files', type=argparse.FileType('rb'), nargs='+', help='File(s) to be analysed') - parser.add_argument('-v', '--verbose', action="store_true", default=False, dest='verbose', - help='Display complementary information about file if any. Stdout will contain logs about the detection process.') - parser.add_argument('-a', '--with-alternative', action="store_true", default=False, dest='alternatives', - help='Output complementary possibilities if any. Top-level JSON WILL be a list.') - parser.add_argument('-n', '--normalize', action="store_true", default=False, dest='normalize', - help='Permit to normalize input file. If not set, program does not write anything.') - parser.add_argument('-m', '--minimal', action="store_true", default=False, dest='minimal', - help='Only output the charset detected to STDOUT. Disabling JSON output.') - parser.add_argument('-r', '--replace', action="store_true", default=False, dest='replace', - help='Replace file when trying to normalize it instead of creating a new one.') - parser.add_argument('-f', '--force', action="store_true", default=False, dest='force', - help='Replace file without asking if you are sure, use this flag with caution.') - parser.add_argument('-t', '--threshold', action="store", default=0.1, type=float, dest='threshold', - help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.") + parser.add_argument( + "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed" + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + default=False, + dest="verbose", + help="Display complementary information about file if any. " + "Stdout will contain logs about the detection process.", + ) + parser.add_argument( + "-a", + "--with-alternative", + action="store_true", + default=False, + dest="alternatives", + help="Output complementary possibilities if any. Top-level JSON WILL be a list.", + ) + parser.add_argument( + "-n", + "--normalize", + action="store_true", + default=False, + dest="normalize", + help="Permit to normalize input file. If not set, program does not write anything.", + ) + parser.add_argument( + "-m", + "--minimal", + action="store_true", + default=False, + dest="minimal", + help="Only output the charset detected to STDOUT. Disabling JSON output.", + ) + parser.add_argument( + "-r", + "--replace", + action="store_true", + default=False, + dest="replace", + help="Replace file when trying to normalize it instead of creating a new one.", + ) + parser.add_argument( + "-f", + "--force", + action="store_true", + default=False, + dest="force", + help="Replace file without asking if you are sure, use this flag with caution.", + ) + parser.add_argument( + "-t", + "--threshold", + action="store", + default=0.1, + type=float, + dest="threshold", + help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.", + ) parser.add_argument( "--version", action="version", - version="Charset-Normalizer {} - Python {}".format(__version__, python_version()), - help="Show version information and exit." + version="Charset-Normalizer {} - Python {}".format( + __version__, python_version() + ), + help="Show version information and exit.", ) args = parser.parse_args(argv) if args.replace is True and args.normalize is False: - print('Use --replace in addition of --normalize only.', file=sys.stderr) + print("Use --replace in addition of --normalize only.", file=sys.stderr) return 1 if args.force is True and args.replace is False: - print('Use --force in addition of --replace only.', file=sys.stderr) + print("Use --force in addition of --replace only.", file=sys.stderr) return 1 - if args.threshold < 0. or args.threshold > 1.: - print('--threshold VALUE should be between 0. AND 1.', file=sys.stderr) + if args.threshold < 0.0 or args.threshold > 1.0: + print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr) return 1 + x_ = [] + for my_file in args.files: - matches = from_fp( - my_file, - threshold=args.threshold, - explain=args.verbose - ) + matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose) - if len(matches) == 0: - print('Unable to identify originating encoding for "{}". {}'.format(my_file.name, 'Maybe try increasing maximum amount of chaos.' if args.threshold < 1. else ''), file=sys.stderr) - if my_file.closed is False: - my_file.close() - continue + best_guess = matches.best() - x_ = [] - - r_ = matches.best() - p_ = r_.first() - - x_.append( - CliDetectionResult( - abspath(my_file.name), - p_.encoding, - p_.encoding_aliases, - [cp for cp in p_.could_be_from_charset if cp != p_.encoding], - p_.language, - p_.alphabets, - p_.bom, - p_.percent_chaos, - p_.percent_coherence, - None, - True + if best_guess is None: + print( + 'Unable to identify originating encoding for "{}". {}'.format( + my_file.name, + "Maybe try increasing maximum amount of chaos." + if args.threshold < 1.0 + else "", + ), + file=sys.stderr, + ) + x_.append( + CliDetectionResult( + abspath(my_file.name), + None, + [], + [], + "Unknown", + [], + False, + 1.0, + 0.0, + None, + True, + ) + ) + else: + x_.append( + CliDetectionResult( + abspath(my_file.name), + best_guess.encoding, + best_guess.encoding_aliases, + [ + cp + for cp in best_guess.could_be_from_charset + if cp != best_guess.encoding + ], + best_guess.language, + best_guess.alphabets, + best_guess.bom, + best_guess.percent_chaos, + best_guess.percent_coherence, + None, + True, + ) ) - ) - if len(matches) > 1 and args.alternatives: - for el in matches: - if el != p_: - x_.append( - CliDetectionResult( - abspath(my_file.name), - el.encoding, - el.encoding_aliases, - [cp for cp in el.could_be_from_charset if cp != el.encoding], - el.language, - el.alphabets, - el.bom, - el.percent_chaos, - el.percent_coherence, - None, - False + if len(matches) > 1 and args.alternatives: + for el in matches: + if el != best_guess: + x_.append( + CliDetectionResult( + abspath(my_file.name), + el.encoding, + el.encoding_aliases, + [ + cp + for cp in el.could_be_from_charset + if cp != el.encoding + ], + el.language, + el.alphabets, + el.bom, + el.percent_chaos, + el.percent_coherence, + None, + False, + ) ) + + if args.normalize is True: + + if best_guess.encoding.startswith("utf") is True: + print( + '"{}" file does not need to be normalized, as it already came from unicode.'.format( + my_file.name + ), + file=sys.stderr, ) - - if args.normalize is True: - - if p_.encoding.startswith('utf') is True: - print('"{}" file does not need to be normalized, as it already came from unicode.'.format(my_file.name), file=sys.stderr) - if my_file.closed is False: - my_file.close() - continue - - o_ = my_file.name.split('.') # type: list[str] - - if args.replace is False: - o_.insert(-1, p_.encoding) - if my_file.closed is False: - my_file.close() - else: - if args.force is False and query_yes_no( - 'Are you sure to normalize "{}" by replacing it ?'.format(my_file.name), 'no') is False: if my_file.closed is False: my_file.close() continue - try: - x_[0].unicode_path = './{}'.format('.'.join(o_)) + o_ = my_file.name.split(".") # type: List[str] - with open(x_[0].unicode_path, 'w', encoding='utf-8') as fp: - fp.write( - str(p_) - ) - except IOError as e: - print(str(e), file=sys.stderr) - if my_file.closed is False: - my_file.close() - return 2 + if args.replace is False: + o_.insert(-1, best_guess.encoding) + if my_file.closed is False: + my_file.close() + else: + if ( + args.force is False + and query_yes_no( + 'Are you sure to normalize "{}" by replacing it ?'.format( + my_file.name + ), + "no", + ) + is False + ): + if my_file.closed is False: + my_file.close() + continue + + try: + x_[0].unicode_path = abspath("./{}".format(".".join(o_))) + + with open(x_[0].unicode_path, "w", encoding="utf-8") as fp: + fp.write(str(best_guess)) + except IOError as e: + print(str(e), file=sys.stderr) + if my_file.closed is False: + my_file.close() + return 2 if my_file.closed is False: my_file.close() @@ -187,24 +267,25 @@ def cli_detect(argv=None): if args.minimal is False: print( dumps( - [ - el.__dict__ for el in x_ - ] if args.alternatives else x_[0].__dict__, + [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__, ensure_ascii=True, - indent=4 + indent=4, ) ) else: - print( - ', '.join( - [ - el.encoding for el in x_ - ] + for my_file in args.files: + print( + ", ".join( + [ + el.encoding if el.encoding else "undefined" + for el in x_ + if el.path == abspath(my_file.name) + ] + ) ) - ) return 0 -if __name__ == '__main__': +if __name__ == "__main__": cli_detect() diff --git a/pipenv/vendor/charset_normalizer/constant.py b/pipenv/vendor/charset_normalizer/constant.py index c9c96555..2e5974d9 100644 --- a/pipenv/vendor/charset_normalizer/constant.py +++ b/pipenv/vendor/charset_normalizer/constant.py @@ -1,64 +1,344 @@ from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE -from typing import Dict, List, Union -from encodings.aliases import aliases -from re import compile as re_compile, IGNORECASE from collections import OrderedDict +from encodings.aliases import aliases +from re import IGNORECASE, compile as re_compile +from typing import Dict, List, Set, Union # Contain for each eligible encoding a list of/item bytes SIG/BOM -ENCODING_MARKS = OrderedDict([ - ('utf_8', BOM_UTF8), - ('utf_7', [ - b'\x2b\x2f\x76\x38', - b'\x2b\x2f\x76\x39', - b'\x2b\x2f\x76\x2b', - b'\x2b\x2f\x76\x2f', - b'\x2b\x2f\x76\x38\x2d' - ]), - ('gb18030', b'\x84\x31\x95\x33'), - ('utf_32', [ - BOM_UTF32_BE, - BOM_UTF32_LE - ]), - ('utf_16', [ - BOM_UTF16_BE, - BOM_UTF16_LE - ]), -]) # type: Dict[str, Union[bytes, List[bytes]]] +ENCODING_MARKS = OrderedDict( + [ + ("utf_8", BOM_UTF8), + ( + "utf_7", + [ + b"\x2b\x2f\x76\x38", + b"\x2b\x2f\x76\x39", + b"\x2b\x2f\x76\x2b", + b"\x2b\x2f\x76\x2f", + b"\x2b\x2f\x76\x38\x2d", + ], + ), + ("gb18030", b"\x84\x31\x95\x33"), + ("utf_32", [BOM_UTF32_BE, BOM_UTF32_LE]), + ("utf_16", [BOM_UTF16_BE, BOM_UTF16_LE]), + ] +) # type: Dict[str, Union[bytes, List[bytes]]] TOO_SMALL_SEQUENCE = 32 # type: int TOO_BIG_SEQUENCE = int(10e6) # type: int UTF8_MAXIMAL_ALLOCATION = 1112064 # type: int -UNICODE_RANGES_COMBINED = {'Control character': range(0, 31+1), 'Basic Latin': range(32, 127+1), 'Latin-1 Supplement': range(128, 255+1), 'Latin Extended-A': range(256, 383+1), 'Latin Extended-B': range(384, 591+1), 'IPA Extensions': range(592, 687+1), 'Spacing Modifier Letters': range(688, 767+1), 'Combining Diacritical Marks': range(768, 879+1), 'Greek and Coptic': range(880, 1023+1), 'Cyrillic': range(1024, 1279+1), 'Cyrillic Supplement': range(1280, 1327+1), 'Armenian': range(1328, 1423+1), 'Hebrew': range(1424, 1535+1), 'Arabic': range(1536, 1791+1), 'Syriac': range(1792, 1871+1), 'Arabic Supplement': range(1872, 1919+1), 'Thaana': range(1920, 1983+1), 'NKo': range(1984, 2047+1), 'Samaritan': range(2048, 2111+1), 'Mandaic': range(2112, 2143+1), 'Syriac Supplement': range(2144, 2159+1), 'Arabic Extended-A': range(2208, 2303+1), 'Devanagari': range(2304, 2431+1), 'Bengali': range(2432, 2559+1), 'Gurmukhi': range(2560, 2687+1), 'Gujarati': range(2688, 2815+1), 'Oriya': range(2816, 2943+1), 'Tamil': range(2944, 3071+1), 'Telugu': range(3072, 3199+1), 'Kannada': range(3200, 3327+1), 'Malayalam': range(3328, 3455+1), 'Sinhala': range(3456, 3583+1), 'Thai': range(3584, 3711+1), 'Lao': range(3712, 3839+1), 'Tibetan': range(3840, 4095+1), 'Myanmar': range(4096, 4255+1), 'Georgian': range(4256, 4351+1), 'Hangul Jamo': range(4352, 4607+1), 'Ethiopic': range(4608, 4991+1), 'Ethiopic Supplement': range(4992, 5023+1), 'Cherokee': range(5024, 5119+1), 'Unified Canadian Aboriginal Syllabics': range(5120, 5759+1), 'Ogham': range(5760, 5791+1), 'Runic': range(5792, 5887+1), 'Tagalog': range(5888, 5919+1), 'Hanunoo': range(5920, 5951+1), 'Buhid': range(5952, 5983+1), 'Tagbanwa': range(5984, 6015+1), 'Khmer': range(6016, 6143+1), 'Mongolian': range(6144, 6319+1), 'Unified Canadian Aboriginal Syllabics Extended': range(6320, 6399+1), 'Limbu': range(6400, 6479+1), 'Tai Le': range(6480, 6527+1), 'New Tai Lue': range(6528, 6623+1), 'Khmer Symbols': range(6624, 6655+1), 'Buginese': range(6656, 6687+1), 'Tai Tham': range(6688, 6831+1), 'Combining Diacritical Marks Extended': range(6832, 6911+1), 'Balinese': range(6912, 7039+1), 'Sundanese': range(7040, 7103+1), 'Batak': range(7104, 7167+1), 'Lepcha': range(7168, 7247+1), 'Ol Chiki': range(7248, 7295+1), 'Cyrillic Extended C': range(7296, 7311+1), 'Sundanese Supplement': range(7360, 7375+1), 'Vedic Extensions': range(7376, 7423+1), 'Phonetic Extensions': range(7424, 7551+1), 'Phonetic Extensions Supplement': range(7552, 7615+1), 'Combining Diacritical Marks Supplement': range(7616, 7679+1), 'Latin Extended Additional': range(7680, 7935+1), 'Greek Extended': range(7936, 8191+1), 'General Punctuation': range(8192, 8303+1), 'Superscripts and Subscripts': range(8304, 8351+1), 'Currency Symbols': range(8352, 8399+1), 'Combining Diacritical Marks for Symbols': range(8400, 8447+1), 'Letterlike Symbols': range(8448, 8527+1), 'Number Forms': range(8528, 8591+1), 'Arrows': range(8592, 8703+1), 'Mathematical Operators': range(8704, 8959+1), 'Miscellaneous Technical': range(8960, 9215+1), 'Control Pictures': range(9216, 9279+1), 'Optical Character Recognition': range(9280, 9311+1), 'Enclosed Alphanumerics': range(9312, 9471+1), 'Box Drawing': range(9472, 9599+1), 'Block Elements': range(9600, 9631+1), 'Geometric Shapes': range(9632, 9727+1), 'Miscellaneous Symbols': range(9728, 9983+1), 'Dingbats': range(9984, 10175+1), 'Miscellaneous Mathematical Symbols-A': range(10176, 10223+1), 'Supplemental Arrows-A': range(10224, 10239+1), 'Braille Patterns': range(10240, 10495+1), 'Supplemental Arrows-B': range(10496, 10623+1), 'Miscellaneous Mathematical Symbols-B': range(10624, 10751+1), 'Supplemental Mathematical Operators': range(10752, 11007+1), 'Miscellaneous Symbols and Arrows': range(11008, 11263+1), 'Glagolitic': range(11264, 11359+1), 'Latin Extended-C': range(11360, 11391+1), 'Coptic': range(11392, 11519+1), 'Georgian Supplement': range(11520, 11567+1), 'Tifinagh': range(11568, 11647+1), 'Ethiopic Extended': range(11648, 11743+1), 'Cyrillic Extended-A': range(11744, 11775+1), 'Supplemental Punctuation': range(11776, 11903+1), 'CJK Radicals Supplement': range(11904, 12031+1), 'Kangxi Radicals': range(12032, 12255+1), 'Ideographic Description Characters': range(12272, 12287+1), 'CJK Symbols and Punctuation': range(12288, 12351+1), 'Hiragana': range(12352, 12447+1), 'Katakana': range(12448, 12543+1), 'Bopomofo': range(12544, 12591+1), 'Hangul Compatibility Jamo': range(12592, 12687+1), 'Kanbun': range(12688, 12703+1), 'Bopomofo Extended': range(12704, 12735+1), 'CJK Strokes': range(12736, 12783+1), 'Katakana Phonetic Extensions': range(12784, 12799+1), 'Enclosed CJK Letters and Months': range(12800, 13055+1), 'CJK Compatibility': range(13056, 13311+1), 'CJK Unified Ideographs Extension A': range(13312, 19903+1), 'Yijing Hexagram Symbols': range(19904, 19967+1), 'CJK Unified Ideographs': range(19968, 40959+1), 'Yi Syllables': range(40960, 42127+1), 'Yi Radicals': range(42128, 42191+1), 'Lisu': range(42192, 42239+1), 'Vai': range(42240, 42559+1), 'Cyrillic Extended-B': range(42560, 42655+1), 'Bamum': range(42656, 42751+1), 'Modifier Tone Letters': range(42752, 42783+1), 'Latin Extended-D': range(42784, 43007+1), 'Syloti Nagri': range(43008, 43055+1), 'Common Indic Number Forms': range(43056, 43071+1), 'Phags-pa': range(43072, 43135+1), 'Saurashtra': range(43136, 43231+1), 'Devanagari Extended': range(43232, 43263+1), 'Kayah Li': range(43264, 43311+1), 'Rejang': range(43312, 43359+1), 'Hangul Jamo Extended-A': range(43360, 43391+1), 'Javanese': range(43392, 43487+1), 'Myanmar Extended-B': range(43488, 43519+1), 'Cham': range(43520, 43615+1), 'Myanmar Extended-A': range(43616, 43647+1), 'Tai Viet': range(43648, 43743+1), 'Meetei Mayek Extensions': range(43744, 43775+1), 'Ethiopic Extended-A': range(43776, 43823+1), 'Latin Extended-E': range(43824, 43887+1), 'Cherokee Supplement': range(43888, 43967+1), 'Meetei Mayek': range(43968, 44031+1), 'Hangul Syllables': range(44032, 55215+1), 'Hangul Jamo Extended-B': range(55216, 55295+1), 'High Surrogates': range(55296, 56191+1), 'High Private Use Surrogates': range(56192, 56319+1), 'Low Surrogates': range(56320, 57343+1), 'Private Use Area': range(57344, 63743+1), 'CJK Compatibility Ideographs': range(63744, 64255+1), 'Alphabetic Presentation Forms': range(64256, 64335+1), 'Arabic Presentation Forms-A': range(64336, 65023+1), 'Variation Selectors': range(65024, 65039+1), 'Vertical Forms': range(65040, 65055+1), 'Combining Half Marks': range(65056, 65071+1), 'CJK Compatibility Forms': range(65072, 65103+1), 'Small Form Variants': range(65104, 65135+1), 'Arabic Presentation Forms-B': range(65136, 65279+1), 'Halfwidth and Fullwidth Forms': range(65280, 65519+1), 'Specials': range(65520, 65535+1), 'Linear B Syllabary': range(65536, 65663+1), 'Linear B Ideograms': range(65664, 65791+1), 'Aegean Numbers': range(65792, 65855+1), 'Ancient Greek Numbers': range(65856, 65935+1), 'Ancient Symbols': range(65936, 65999+1), 'Phaistos Disc': range(66000, 66047+1), 'Lycian': range(66176, 66207+1), 'Carian': range(66208, 66271+1), 'Coptic Epact Numbers': range(66272, 66303+1), 'Old Italic': range(66304, 66351+1), 'Gothic': range(66352, 66383+1), 'Old Permic': range(66384, 66431+1), 'Ugaritic': range(66432, 66463+1), 'Old Persian': range(66464, 66527+1), 'Deseret': range(66560, 66639+1), 'Shavian': range(66640, 66687+1), 'Osmanya': range(66688, 66735+1), 'Osage': range(66736, 66815+1), 'Elbasan': range(66816, 66863+1), 'Caucasian Albanian': range(66864, 66927+1), 'Linear A': range(67072, 67455+1), 'Cypriot Syllabary': range(67584, 67647+1), 'Imperial Aramaic': range(67648, 67679+1), 'Palmyrene': range(67680, 67711+1), 'Nabataean': range(67712, 67759+1), 'Hatran': range(67808, 67839+1), 'Phoenician': range(67840, 67871+1), 'Lydian': range(67872, 67903+1), 'Meroitic Hieroglyphs': range(67968, 67999+1), 'Meroitic Cursive': range(68000, 68095+1), 'Kharoshthi': range(68096, 68191+1), 'Old South Arabian': range(68192, 68223+1), 'Old North Arabian': range(68224, 68255+1), 'Manichaean': range(68288, 68351+1), 'Avestan': range(68352, 68415+1), 'Inscriptional Parthian': range(68416, 68447+1), 'Inscriptional Pahlavi': range(68448, 68479+1), 'Psalter Pahlavi': range(68480, 68527+1), 'Old Turkic': range(68608, 68687+1), 'Old Hungarian': range(68736, 68863+1), 'Rumi Numeral Symbols': range(69216, 69247+1), 'Brahmi': range(69632, 69759+1), 'Kaithi': range(69760, 69839+1), 'Sora Sompeng': range(69840, 69887+1), 'Chakma': range(69888, 69967+1), 'Mahajani': range(69968, 70015+1), 'Sharada': range(70016, 70111+1), 'Sinhala Archaic Numbers': range(70112, 70143+1), 'Khojki': range(70144, 70223+1), 'Multani': range(70272, 70319+1), 'Khudawadi': range(70320, 70399+1), 'Grantha': range(70400, 70527+1), 'Newa': range(70656, 70783+1), 'Tirhuta': range(70784, 70879+1), 'Siddham': range(71040, 71167+1), 'Modi': range(71168, 71263+1), 'Mongolian Supplement': range(71264, 71295+1), 'Takri': range(71296, 71375+1), 'Ahom': range(71424, 71487+1), 'Warang Citi': range(71840, 71935+1), 'Zanabazar Square': range(72192, 72271+1), 'Soyombo': range(72272, 72367+1), 'Pau Cin Hau': range(72384, 72447+1), 'Bhaiksuki': range(72704, 72815+1), 'Marchen': range(72816, 72895+1), 'Masaram Gondi': range(72960, 73055+1), 'Cuneiform': range(73728, 74751+1), 'Cuneiform Numbers and Punctuation': range(74752, 74879+1), 'Early Dynastic Cuneiform': range(74880, 75087+1), 'Egyptian Hieroglyphs': range(77824, 78895+1), 'Anatolian Hieroglyphs': range(82944, 83583+1), 'Bamum Supplement': range(92160, 92735+1), 'Mro': range(92736, 92783+1), 'Bassa Vah': range(92880, 92927+1), 'Pahawh Hmong': range(92928, 93071+1), 'Miao': range(93952, 94111+1), 'Ideographic Symbols and Punctuation': range(94176, 94207+1), 'Tangut': range(94208, 100351+1), 'Tangut Components': range(100352, 101119+1), 'Kana Supplement': range(110592, 110847+1), 'Kana Extended-A': range(110848, 110895+1), 'Nushu': range(110960, 111359+1), 'Duployan': range(113664, 113823+1), 'Shorthand Format Controls': range(113824, 113839+1), 'Byzantine Musical Symbols': range(118784, 119039+1), 'Musical Symbols': range(119040, 119295+1), 'Ancient Greek Musical Notation': range(119296, 119375+1), 'Tai Xuan Jing Symbols': range(119552, 119647+1), 'Counting Rod Numerals': range(119648, 119679+1), 'Mathematical Alphanumeric Symbols': range(119808, 120831+1), 'Sutton SignWriting': range(120832, 121519+1), 'Glagolitic Supplement': range(122880, 122927+1), 'Mende Kikakui': range(124928, 125151+1), 'Adlam': range(125184, 125279+1), 'Arabic Mathematical Alphabetic Symbols': range(126464, 126719+1), 'Mahjong Tiles': range(126976, 127023+1), 'Domino Tiles': range(127024, 127135+1), 'Playing Cards': range(127136, 127231+1), 'Enclosed Alphanumeric Supplement': range(127232, 127487+1), 'Enclosed Ideographic Supplement': range(127488, 127743+1), 'Miscellaneous Symbols and Pictographs': range(127744, 128511+1), 'Emoticons range(Emoji)': range(128512, 128591+1), 'Ornamental Dingbats': range(128592, 128639+1), 'Transport and Map Symbols': range(128640, 128767+1), 'Alchemical Symbols': range(128768, 128895+1), 'Geometric Shapes Extended': range(128896, 129023+1), 'Supplemental Arrows-C': range(129024, 129279+1), 'Supplemental Symbols and Pictographs': range(129280, 129535+1), 'CJK Unified Ideographs Extension B': range(131072, 173791+1), 'CJK Unified Ideographs Extension C': range(173824, 177983+1), 'CJK Unified Ideographs Extension D': range(177984, 178207+1), 'CJK Unified Ideographs Extension E': range(178208, 183983+1), 'CJK Unified Ideographs Extension F': range(183984, 191471+1), 'CJK Compatibility Ideographs Supplement': range(194560, 195103+1), 'Tags': range(917504, 917631+1), 'Variation Selectors Supplement': range(917760, 917999+1)} # type: Dict[str, range] +UNICODE_RANGES_COMBINED = { + "Control character": range(0, 31 + 1), + "Basic Latin": range(32, 127 + 1), + "Latin-1 Supplement": range(128, 255 + 1), + "Latin Extended-A": range(256, 383 + 1), + "Latin Extended-B": range(384, 591 + 1), + "IPA Extensions": range(592, 687 + 1), + "Spacing Modifier Letters": range(688, 767 + 1), + "Combining Diacritical Marks": range(768, 879 + 1), + "Greek and Coptic": range(880, 1023 + 1), + "Cyrillic": range(1024, 1279 + 1), + "Cyrillic Supplement": range(1280, 1327 + 1), + "Armenian": range(1328, 1423 + 1), + "Hebrew": range(1424, 1535 + 1), + "Arabic": range(1536, 1791 + 1), + "Syriac": range(1792, 1871 + 1), + "Arabic Supplement": range(1872, 1919 + 1), + "Thaana": range(1920, 1983 + 1), + "NKo": range(1984, 2047 + 1), + "Samaritan": range(2048, 2111 + 1), + "Mandaic": range(2112, 2143 + 1), + "Syriac Supplement": range(2144, 2159 + 1), + "Arabic Extended-A": range(2208, 2303 + 1), + "Devanagari": range(2304, 2431 + 1), + "Bengali": range(2432, 2559 + 1), + "Gurmukhi": range(2560, 2687 + 1), + "Gujarati": range(2688, 2815 + 1), + "Oriya": range(2816, 2943 + 1), + "Tamil": range(2944, 3071 + 1), + "Telugu": range(3072, 3199 + 1), + "Kannada": range(3200, 3327 + 1), + "Malayalam": range(3328, 3455 + 1), + "Sinhala": range(3456, 3583 + 1), + "Thai": range(3584, 3711 + 1), + "Lao": range(3712, 3839 + 1), + "Tibetan": range(3840, 4095 + 1), + "Myanmar": range(4096, 4255 + 1), + "Georgian": range(4256, 4351 + 1), + "Hangul Jamo": range(4352, 4607 + 1), + "Ethiopic": range(4608, 4991 + 1), + "Ethiopic Supplement": range(4992, 5023 + 1), + "Cherokee": range(5024, 5119 + 1), + "Unified Canadian Aboriginal Syllabics": range(5120, 5759 + 1), + "Ogham": range(5760, 5791 + 1), + "Runic": range(5792, 5887 + 1), + "Tagalog": range(5888, 5919 + 1), + "Hanunoo": range(5920, 5951 + 1), + "Buhid": range(5952, 5983 + 1), + "Tagbanwa": range(5984, 6015 + 1), + "Khmer": range(6016, 6143 + 1), + "Mongolian": range(6144, 6319 + 1), + "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6399 + 1), + "Limbu": range(6400, 6479 + 1), + "Tai Le": range(6480, 6527 + 1), + "New Tai Lue": range(6528, 6623 + 1), + "Khmer Symbols": range(6624, 6655 + 1), + "Buginese": range(6656, 6687 + 1), + "Tai Tham": range(6688, 6831 + 1), + "Combining Diacritical Marks Extended": range(6832, 6911 + 1), + "Balinese": range(6912, 7039 + 1), + "Sundanese": range(7040, 7103 + 1), + "Batak": range(7104, 7167 + 1), + "Lepcha": range(7168, 7247 + 1), + "Ol Chiki": range(7248, 7295 + 1), + "Cyrillic Extended C": range(7296, 7311 + 1), + "Sundanese Supplement": range(7360, 7375 + 1), + "Vedic Extensions": range(7376, 7423 + 1), + "Phonetic Extensions": range(7424, 7551 + 1), + "Phonetic Extensions Supplement": range(7552, 7615 + 1), + "Combining Diacritical Marks Supplement": range(7616, 7679 + 1), + "Latin Extended Additional": range(7680, 7935 + 1), + "Greek Extended": range(7936, 8191 + 1), + "General Punctuation": range(8192, 8303 + 1), + "Superscripts and Subscripts": range(8304, 8351 + 1), + "Currency Symbols": range(8352, 8399 + 1), + "Combining Diacritical Marks for Symbols": range(8400, 8447 + 1), + "Letterlike Symbols": range(8448, 8527 + 1), + "Number Forms": range(8528, 8591 + 1), + "Arrows": range(8592, 8703 + 1), + "Mathematical Operators": range(8704, 8959 + 1), + "Miscellaneous Technical": range(8960, 9215 + 1), + "Control Pictures": range(9216, 9279 + 1), + "Optical Character Recognition": range(9280, 9311 + 1), + "Enclosed Alphanumerics": range(9312, 9471 + 1), + "Box Drawing": range(9472, 9599 + 1), + "Block Elements": range(9600, 9631 + 1), + "Geometric Shapes": range(9632, 9727 + 1), + "Miscellaneous Symbols": range(9728, 9983 + 1), + "Dingbats": range(9984, 10175 + 1), + "Miscellaneous Mathematical Symbols-A": range(10176, 10223 + 1), + "Supplemental Arrows-A": range(10224, 10239 + 1), + "Braille Patterns": range(10240, 10495 + 1), + "Supplemental Arrows-B": range(10496, 10623 + 1), + "Miscellaneous Mathematical Symbols-B": range(10624, 10751 + 1), + "Supplemental Mathematical Operators": range(10752, 11007 + 1), + "Miscellaneous Symbols and Arrows": range(11008, 11263 + 1), + "Glagolitic": range(11264, 11359 + 1), + "Latin Extended-C": range(11360, 11391 + 1), + "Coptic": range(11392, 11519 + 1), + "Georgian Supplement": range(11520, 11567 + 1), + "Tifinagh": range(11568, 11647 + 1), + "Ethiopic Extended": range(11648, 11743 + 1), + "Cyrillic Extended-A": range(11744, 11775 + 1), + "Supplemental Punctuation": range(11776, 11903 + 1), + "CJK Radicals Supplement": range(11904, 12031 + 1), + "Kangxi Radicals": range(12032, 12255 + 1), + "Ideographic Description Characters": range(12272, 12287 + 1), + "CJK Symbols and Punctuation": range(12288, 12351 + 1), + "Hiragana": range(12352, 12447 + 1), + "Katakana": range(12448, 12543 + 1), + "Bopomofo": range(12544, 12591 + 1), + "Hangul Compatibility Jamo": range(12592, 12687 + 1), + "Kanbun": range(12688, 12703 + 1), + "Bopomofo Extended": range(12704, 12735 + 1), + "CJK Strokes": range(12736, 12783 + 1), + "Katakana Phonetic Extensions": range(12784, 12799 + 1), + "Enclosed CJK Letters and Months": range(12800, 13055 + 1), + "CJK Compatibility": range(13056, 13311 + 1), + "CJK Unified Ideographs Extension A": range(13312, 19903 + 1), + "Yijing Hexagram Symbols": range(19904, 19967 + 1), + "CJK Unified Ideographs": range(19968, 40959 + 1), + "Yi Syllables": range(40960, 42127 + 1), + "Yi Radicals": range(42128, 42191 + 1), + "Lisu": range(42192, 42239 + 1), + "Vai": range(42240, 42559 + 1), + "Cyrillic Extended-B": range(42560, 42655 + 1), + "Bamum": range(42656, 42751 + 1), + "Modifier Tone Letters": range(42752, 42783 + 1), + "Latin Extended-D": range(42784, 43007 + 1), + "Syloti Nagri": range(43008, 43055 + 1), + "Common Indic Number Forms": range(43056, 43071 + 1), + "Phags-pa": range(43072, 43135 + 1), + "Saurashtra": range(43136, 43231 + 1), + "Devanagari Extended": range(43232, 43263 + 1), + "Kayah Li": range(43264, 43311 + 1), + "Rejang": range(43312, 43359 + 1), + "Hangul Jamo Extended-A": range(43360, 43391 + 1), + "Javanese": range(43392, 43487 + 1), + "Myanmar Extended-B": range(43488, 43519 + 1), + "Cham": range(43520, 43615 + 1), + "Myanmar Extended-A": range(43616, 43647 + 1), + "Tai Viet": range(43648, 43743 + 1), + "Meetei Mayek Extensions": range(43744, 43775 + 1), + "Ethiopic Extended-A": range(43776, 43823 + 1), + "Latin Extended-E": range(43824, 43887 + 1), + "Cherokee Supplement": range(43888, 43967 + 1), + "Meetei Mayek": range(43968, 44031 + 1), + "Hangul Syllables": range(44032, 55215 + 1), + "Hangul Jamo Extended-B": range(55216, 55295 + 1), + "High Surrogates": range(55296, 56191 + 1), + "High Private Use Surrogates": range(56192, 56319 + 1), + "Low Surrogates": range(56320, 57343 + 1), + "Private Use Area": range(57344, 63743 + 1), + "CJK Compatibility Ideographs": range(63744, 64255 + 1), + "Alphabetic Presentation Forms": range(64256, 64335 + 1), + "Arabic Presentation Forms-A": range(64336, 65023 + 1), + "Variation Selectors": range(65024, 65039 + 1), + "Vertical Forms": range(65040, 65055 + 1), + "Combining Half Marks": range(65056, 65071 + 1), + "CJK Compatibility Forms": range(65072, 65103 + 1), + "Small Form Variants": range(65104, 65135 + 1), + "Arabic Presentation Forms-B": range(65136, 65279 + 1), + "Halfwidth and Fullwidth Forms": range(65280, 65519 + 1), + "Specials": range(65520, 65535 + 1), + "Linear B Syllabary": range(65536, 65663 + 1), + "Linear B Ideograms": range(65664, 65791 + 1), + "Aegean Numbers": range(65792, 65855 + 1), + "Ancient Greek Numbers": range(65856, 65935 + 1), + "Ancient Symbols": range(65936, 65999 + 1), + "Phaistos Disc": range(66000, 66047 + 1), + "Lycian": range(66176, 66207 + 1), + "Carian": range(66208, 66271 + 1), + "Coptic Epact Numbers": range(66272, 66303 + 1), + "Old Italic": range(66304, 66351 + 1), + "Gothic": range(66352, 66383 + 1), + "Old Permic": range(66384, 66431 + 1), + "Ugaritic": range(66432, 66463 + 1), + "Old Persian": range(66464, 66527 + 1), + "Deseret": range(66560, 66639 + 1), + "Shavian": range(66640, 66687 + 1), + "Osmanya": range(66688, 66735 + 1), + "Osage": range(66736, 66815 + 1), + "Elbasan": range(66816, 66863 + 1), + "Caucasian Albanian": range(66864, 66927 + 1), + "Linear A": range(67072, 67455 + 1), + "Cypriot Syllabary": range(67584, 67647 + 1), + "Imperial Aramaic": range(67648, 67679 + 1), + "Palmyrene": range(67680, 67711 + 1), + "Nabataean": range(67712, 67759 + 1), + "Hatran": range(67808, 67839 + 1), + "Phoenician": range(67840, 67871 + 1), + "Lydian": range(67872, 67903 + 1), + "Meroitic Hieroglyphs": range(67968, 67999 + 1), + "Meroitic Cursive": range(68000, 68095 + 1), + "Kharoshthi": range(68096, 68191 + 1), + "Old South Arabian": range(68192, 68223 + 1), + "Old North Arabian": range(68224, 68255 + 1), + "Manichaean": range(68288, 68351 + 1), + "Avestan": range(68352, 68415 + 1), + "Inscriptional Parthian": range(68416, 68447 + 1), + "Inscriptional Pahlavi": range(68448, 68479 + 1), + "Psalter Pahlavi": range(68480, 68527 + 1), + "Old Turkic": range(68608, 68687 + 1), + "Old Hungarian": range(68736, 68863 + 1), + "Rumi Numeral Symbols": range(69216, 69247 + 1), + "Brahmi": range(69632, 69759 + 1), + "Kaithi": range(69760, 69839 + 1), + "Sora Sompeng": range(69840, 69887 + 1), + "Chakma": range(69888, 69967 + 1), + "Mahajani": range(69968, 70015 + 1), + "Sharada": range(70016, 70111 + 1), + "Sinhala Archaic Numbers": range(70112, 70143 + 1), + "Khojki": range(70144, 70223 + 1), + "Multani": range(70272, 70319 + 1), + "Khudawadi": range(70320, 70399 + 1), + "Grantha": range(70400, 70527 + 1), + "Newa": range(70656, 70783 + 1), + "Tirhuta": range(70784, 70879 + 1), + "Siddham": range(71040, 71167 + 1), + "Modi": range(71168, 71263 + 1), + "Mongolian Supplement": range(71264, 71295 + 1), + "Takri": range(71296, 71375 + 1), + "Ahom": range(71424, 71487 + 1), + "Warang Citi": range(71840, 71935 + 1), + "Zanabazar Square": range(72192, 72271 + 1), + "Soyombo": range(72272, 72367 + 1), + "Pau Cin Hau": range(72384, 72447 + 1), + "Bhaiksuki": range(72704, 72815 + 1), + "Marchen": range(72816, 72895 + 1), + "Masaram Gondi": range(72960, 73055 + 1), + "Cuneiform": range(73728, 74751 + 1), + "Cuneiform Numbers and Punctuation": range(74752, 74879 + 1), + "Early Dynastic Cuneiform": range(74880, 75087 + 1), + "Egyptian Hieroglyphs": range(77824, 78895 + 1), + "Anatolian Hieroglyphs": range(82944, 83583 + 1), + "Bamum Supplement": range(92160, 92735 + 1), + "Mro": range(92736, 92783 + 1), + "Bassa Vah": range(92880, 92927 + 1), + "Pahawh Hmong": range(92928, 93071 + 1), + "Miao": range(93952, 94111 + 1), + "Ideographic Symbols and Punctuation": range(94176, 94207 + 1), + "Tangut": range(94208, 100351 + 1), + "Tangut Components": range(100352, 101119 + 1), + "Kana Supplement": range(110592, 110847 + 1), + "Kana Extended-A": range(110848, 110895 + 1), + "Nushu": range(110960, 111359 + 1), + "Duployan": range(113664, 113823 + 1), + "Shorthand Format Controls": range(113824, 113839 + 1), + "Byzantine Musical Symbols": range(118784, 119039 + 1), + "Musical Symbols": range(119040, 119295 + 1), + "Ancient Greek Musical Notation": range(119296, 119375 + 1), + "Tai Xuan Jing Symbols": range(119552, 119647 + 1), + "Counting Rod Numerals": range(119648, 119679 + 1), + "Mathematical Alphanumeric Symbols": range(119808, 120831 + 1), + "Sutton SignWriting": range(120832, 121519 + 1), + "Glagolitic Supplement": range(122880, 122927 + 1), + "Mende Kikakui": range(124928, 125151 + 1), + "Adlam": range(125184, 125279 + 1), + "Arabic Mathematical Alphabetic Symbols": range(126464, 126719 + 1), + "Mahjong Tiles": range(126976, 127023 + 1), + "Domino Tiles": range(127024, 127135 + 1), + "Playing Cards": range(127136, 127231 + 1), + "Enclosed Alphanumeric Supplement": range(127232, 127487 + 1), + "Enclosed Ideographic Supplement": range(127488, 127743 + 1), + "Miscellaneous Symbols and Pictographs": range(127744, 128511 + 1), + "Emoticons range(Emoji)": range(128512, 128591 + 1), + "Ornamental Dingbats": range(128592, 128639 + 1), + "Transport and Map Symbols": range(128640, 128767 + 1), + "Alchemical Symbols": range(128768, 128895 + 1), + "Geometric Shapes Extended": range(128896, 129023 + 1), + "Supplemental Arrows-C": range(129024, 129279 + 1), + "Supplemental Symbols and Pictographs": range(129280, 129535 + 1), + "CJK Unified Ideographs Extension B": range(131072, 173791 + 1), + "CJK Unified Ideographs Extension C": range(173824, 177983 + 1), + "CJK Unified Ideographs Extension D": range(177984, 178207 + 1), + "CJK Unified Ideographs Extension E": range(178208, 183983 + 1), + "CJK Unified Ideographs Extension F": range(183984, 191471 + 1), + "CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1), + "Tags": range(917504, 917631 + 1), + "Variation Selectors Supplement": range(917760, 917999 + 1), +} # type: Dict[str, range] UNICODE_SECONDARY_RANGE_KEYWORD = [ - 'Supplement', - 'Extended', - 'Extensions', - 'Modifier', - 'Marks', - 'Punctuation', - 'Symbols', - 'Forms', - 'Operators', - 'Miscellaneous', - 'Drawing', - 'Block', - 'Shapes', - 'Supplemental', - 'Tags' + "Supplement", + "Extended", + "Extensions", + "Modifier", + "Marks", + "Punctuation", + "Symbols", + "Forms", + "Operators", + "Miscellaneous", + "Drawing", + "Block", + "Shapes", + "Supplemental", + "Tags", ] # type: List[str] RE_POSSIBLE_ENCODING_INDICATION = re_compile( - r'(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)', - IGNORECASE + r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)", + IGNORECASE, ) IANA_SUPPORTED = sorted( filter( - lambda x: x.endswith("_codec") is False and x not in {"rot_13", "tactis", "mbcs"}, - list(set(aliases.values())) + lambda x: x.endswith("_codec") is False + and x not in {"rot_13", "tactis", "mbcs"}, + list(set(aliases.values())), ) ) # type: List[str] @@ -66,157 +346,39 @@ IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED) # type: int # pre-computed code page that are similar using the function cp_similarity. IANA_SUPPORTED_SIMILAR = { - "cp037": [ - "cp1026", - "cp1140", - "cp273", - "cp500" - ], - "cp1026": [ - "cp037", - "cp1140", - "cp273", - "cp500" - ], - "cp1125": [ - "cp866" - ], - "cp1140": [ - "cp037", - "cp1026", - "cp273", - "cp500" - ], - "cp1250": [ - "iso8859_2" - ], - "cp1251": [ - "kz1048", - "ptcp154" - ], - "cp1252": [ - "cp1258", - "iso8859_15", - "iso8859_9", - "latin_1" - ], - "cp1253": [ - "iso8859_7" - ], - "cp1254": [ - "cp1258", - "iso8859_15", - "iso8859_9", - "latin_1" - ], - "cp1257": [ - "iso8859_13" - ], - "cp1258": [ - "cp1252", - "cp1254", - "iso8859_9", - "latin_1" - ], - "cp273": [ - "cp037", - "cp1026", - "cp1140", - "cp500" - ], - "cp437": [ - "cp850", - "cp858", - "cp860", - "cp861", - "cp862", - "cp863", - "cp865" - ], - "cp500": [ - "cp037", - "cp1026", - "cp1140", - "cp273" - ], - "cp850": [ - "cp437", - "cp857", - "cp858", - "cp865" - ], - "cp857": [ - "cp850", - "cp858", - "cp865" - ], - "cp858": [ - "cp437", - "cp850", - "cp857", - "cp865" - ], - "cp860": [ - "cp437", - "cp861", - "cp862", - "cp863", - "cp865" - ], - "cp861": [ - "cp437", - "cp860", - "cp862", - "cp863", - "cp865" - ], - "cp862": [ - "cp437", - "cp860", - "cp861", - "cp863", - "cp865" - ], - "cp863": [ - "cp437", - "cp860", - "cp861", - "cp862", - "cp865" - ], - "cp865": [ - "cp437", - "cp850", - "cp857", - "cp858", - "cp860", - "cp861", - "cp862", - "cp863" - ], - "cp866": [ - "cp1125" - ], - "iso8859_10": [ - "iso8859_14", - "iso8859_15", - "iso8859_4", - "iso8859_9", - "latin_1" - ], - "iso8859_11": [ - "tis_620" - ], - "iso8859_13": [ - "cp1257" - ], + "cp037": ["cp1026", "cp1140", "cp273", "cp500"], + "cp1026": ["cp037", "cp1140", "cp273", "cp500"], + "cp1125": ["cp866"], + "cp1140": ["cp037", "cp1026", "cp273", "cp500"], + "cp1250": ["iso8859_2"], + "cp1251": ["kz1048", "ptcp154"], + "cp1252": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"], + "cp1253": ["iso8859_7"], + "cp1254": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"], + "cp1257": ["iso8859_13"], + "cp1258": ["cp1252", "cp1254", "iso8859_9", "latin_1"], + "cp273": ["cp037", "cp1026", "cp1140", "cp500"], + "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"], + "cp500": ["cp037", "cp1026", "cp1140", "cp273"], + "cp850": ["cp437", "cp857", "cp858", "cp865"], + "cp857": ["cp850", "cp858", "cp865"], + "cp858": ["cp437", "cp850", "cp857", "cp865"], + "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"], + "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"], + "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"], + "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"], + "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"], + "cp866": ["cp1125"], + "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"], + "iso8859_11": ["tis_620"], + "iso8859_13": ["cp1257"], "iso8859_14": [ "iso8859_10", "iso8859_15", "iso8859_16", "iso8859_3", "iso8859_9", - "latin_1" + "latin_1", ], "iso8859_15": [ "cp1252", @@ -226,7 +388,7 @@ IANA_SUPPORTED_SIMILAR = { "iso8859_16", "iso8859_3", "iso8859_9", - "latin_1" + "latin_1", ], "iso8859_16": [ "iso8859_14", @@ -234,29 +396,12 @@ IANA_SUPPORTED_SIMILAR = { "iso8859_2", "iso8859_3", "iso8859_9", - "latin_1" - ], - "iso8859_2": [ - "cp1250", - "iso8859_16", - "iso8859_4" - ], - "iso8859_3": [ - "iso8859_14", - "iso8859_15", - "iso8859_16", - "iso8859_9", - "latin_1" - ], - "iso8859_4": [ - "iso8859_10", - "iso8859_2", - "iso8859_9", - "latin_1" - ], - "iso8859_7": [ - "cp1253" + "latin_1", ], + "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"], + "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"], + "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"], + "iso8859_7": ["cp1253"], "iso8859_9": [ "cp1252", "cp1254", @@ -267,12 +412,9 @@ IANA_SUPPORTED_SIMILAR = { "iso8859_16", "iso8859_3", "iso8859_4", - "latin_1" - ], - "kz1048": [ - "cp1251", - "ptcp154" + "latin_1", ], + "kz1048": ["cp1251", "ptcp154"], "latin_1": [ "cp1252", "cp1254", @@ -283,61 +425,72 @@ IANA_SUPPORTED_SIMILAR = { "iso8859_16", "iso8859_3", "iso8859_4", - "iso8859_9" + "iso8859_9", ], - "mac_iceland": [ - "mac_roman", - "mac_turkish" - ], - "mac_roman": [ - "mac_iceland", - "mac_turkish" - ], - "mac_turkish": [ - "mac_iceland", - "mac_roman" - ], - "ptcp154": [ - "cp1251", - "kz1048" - ], - "tis_620": [ - "iso8859_11" - ] + "mac_iceland": ["mac_roman", "mac_turkish"], + "mac_roman": ["mac_iceland", "mac_turkish"], + "mac_turkish": ["mac_iceland", "mac_roman"], + "ptcp154": ["cp1251", "kz1048"], + "tis_620": ["iso8859_11"], } # type: Dict[str, List[str]] CHARDET_CORRESPONDENCE = { - 'iso2022_kr': 'ISO-2022-KR', - 'iso2022_jp': 'ISO-2022-JP', - 'euc_kr': 'EUC-KR', - 'tis_620': 'TIS-620', - 'utf_32': 'UTF-32', - 'euc_jp': 'EUC-JP', - 'koi8_r': 'KOI8-R', - 'iso8859_1': 'ISO-8859-1', - 'iso8859_2': 'ISO-8859-2', - 'iso8859_5': 'ISO-8859-5', - 'iso8859_6': 'ISO-8859-6', - 'iso8859_7': 'ISO-8859-7', - 'iso8859_8': 'ISO-8859-8', - 'utf_16': 'UTF-16', - 'cp855': 'IBM855', - 'mac_cyrillic': 'MacCyrillic', - 'gb2312': 'GB2312', - 'gb18030': 'GB18030', - 'cp932': 'CP932', - 'cp866': 'IBM866', - 'utf_8': 'utf-8', - 'utf_8_sig': 'UTF-8-SIG', - 'shift_jis': 'SHIFT_JIS', - 'big5': 'Big5', - 'cp1250': 'windows-1250', - 'cp1251': 'windows-1251', - 'cp1252': 'Windows-1252', - 'cp1253': 'windows-1253', - 'cp1255': 'windows-1255', - 'cp1256': 'windows-1256', - 'cp1254': 'Windows-1254', - 'cp949': 'CP949' + "iso2022_kr": "ISO-2022-KR", + "iso2022_jp": "ISO-2022-JP", + "euc_kr": "EUC-KR", + "tis_620": "TIS-620", + "utf_32": "UTF-32", + "euc_jp": "EUC-JP", + "koi8_r": "KOI8-R", + "iso8859_1": "ISO-8859-1", + "iso8859_2": "ISO-8859-2", + "iso8859_5": "ISO-8859-5", + "iso8859_6": "ISO-8859-6", + "iso8859_7": "ISO-8859-7", + "iso8859_8": "ISO-8859-8", + "utf_16": "UTF-16", + "cp855": "IBM855", + "mac_cyrillic": "MacCyrillic", + "gb2312": "GB2312", + "gb18030": "GB18030", + "cp932": "CP932", + "cp866": "IBM866", + "utf_8": "utf-8", + "utf_8_sig": "UTF-8-SIG", + "shift_jis": "SHIFT_JIS", + "big5": "Big5", + "cp1250": "windows-1250", + "cp1251": "windows-1251", + "cp1252": "Windows-1252", + "cp1253": "windows-1253", + "cp1255": "windows-1255", + "cp1256": "windows-1256", + "cp1254": "Windows-1254", + "cp949": "CP949", } # type: Dict[str, str] + + +COMMON_SAFE_ASCII_CHARACTERS = { + "<", + ">", + "=", + ":", + "/", + "&", + ";", + "{", + "}", + "[", + "]", + ",", + "|", + '"', + "-", +} # type: Set[str] + + +KO_NAMES = {"johab", "cp949", "euc_kr"} # type: Set[str] +ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"} # type: Set[str] + +NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+") diff --git a/pipenv/vendor/charset_normalizer/legacy.py b/pipenv/vendor/charset_normalizer/legacy.py index c4d7cd74..cdebe2b8 100644 --- a/pipenv/vendor/charset_normalizer/legacy.py +++ b/pipenv/vendor/charset_normalizer/legacy.py @@ -1,7 +1,10 @@ -from pipenv.vendor.charset_normalizer.api import from_bytes -from pipenv.vendor.charset_normalizer.constant import CHARDET_CORRESPONDENCE +import warnings from typing import Dict, Optional, Union +from .api import from_bytes, from_fp, from_path, normalize +from .constant import CHARDET_CORRESPONDENCE +from .models import CharsetMatch, CharsetMatches + def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]: """ @@ -14,8 +17,10 @@ def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]: :param byte_str: The byte sequence to examine. """ if not isinstance(byte_str, (bytearray, bytes)): - raise TypeError('Expected object of type bytes or bytearray, got: ' - '{0}'.format(type(byte_str))) + raise TypeError( # pragma: nocover + "Expected object of type bytes or bytearray, got: " + "{0}".format(type(byte_str)) + ) if isinstance(byte_str, bytearray): byte_str = bytes(byte_str) @@ -23,16 +28,68 @@ def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]: r = from_bytes(byte_str).best() encoding = r.encoding if r is not None else None - language = r.language if r is not None and r.language != 'Unknown' else '' - confidence = 1. - r.chaos if r is not None else None + language = r.language if r is not None and r.language != "Unknown" else "" + confidence = 1.0 - r.chaos if r is not None else None # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process # but chardet does return 'utf-8-sig' and it is a valid codec name. - if r is not None and encoding == 'utf_8' and r.bom: - encoding += '_sig' + if r is not None and encoding == "utf_8" and r.bom: + encoding += "_sig" return { - 'encoding': encoding if encoding not in CHARDET_CORRESPONDENCE else CHARDET_CORRESPONDENCE[encoding], - 'language': language, - 'confidence': confidence + "encoding": encoding + if encoding not in CHARDET_CORRESPONDENCE + else CHARDET_CORRESPONDENCE[encoding], + "language": language, + "confidence": confidence, } + + +class CharsetNormalizerMatch(CharsetMatch): + pass + + +class CharsetNormalizerMatches(CharsetMatches): + @staticmethod + def from_fp(*args, **kwargs): # type: ignore + warnings.warn( # pragma: nocover + "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " + "and scheduled to be removed in 3.0", + DeprecationWarning, + ) + return from_fp(*args, **kwargs) # pragma: nocover + + @staticmethod + def from_bytes(*args, **kwargs): # type: ignore + warnings.warn( # pragma: nocover + "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " + "and scheduled to be removed in 3.0", + DeprecationWarning, + ) + return from_bytes(*args, **kwargs) # pragma: nocover + + @staticmethod + def from_path(*args, **kwargs): # type: ignore + warnings.warn( # pragma: nocover + "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " + "and scheduled to be removed in 3.0", + DeprecationWarning, + ) + return from_path(*args, **kwargs) # pragma: nocover + + @staticmethod + def normalize(*args, **kwargs): # type: ignore + warnings.warn( # pragma: nocover + "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " + "and scheduled to be removed in 3.0", + DeprecationWarning, + ) + return normalize(*args, **kwargs) # pragma: nocover + + +class CharsetDetector(CharsetNormalizerMatches): + pass + + +class CharsetDoctor(CharsetNormalizerMatches): + pass diff --git a/pipenv/vendor/charset_normalizer/md.py b/pipenv/vendor/charset_normalizer/md.py index b8c5fbf5..2146d61d 100644 --- a/pipenv/vendor/charset_normalizer/md.py +++ b/pipenv/vendor/charset_normalizer/md.py @@ -1,9 +1,24 @@ from functools import lru_cache -from typing import Optional, List +from typing import List, Optional -from pipenv.vendor.charset_normalizer.constant import UNICODE_SECONDARY_RANGE_KEYWORD -from pipenv.vendor.charset_normalizer.utils import is_punctuation, is_symbol, unicode_range, is_accentuated, is_latin, \ - remove_accent, is_separator, is_cjk +from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD +from .utils import ( + is_accentuated, + is_ascii, + is_case_variable, + is_cjk, + is_emoticon, + is_hangul, + is_hiragana, + is_katakana, + is_latin, + is_punctuation, + is_separator, + is_symbol, + is_thai, + remove_accent, + unicode_range, +) class MessDetectorPlugin: @@ -41,8 +56,7 @@ class MessDetectorPlugin: class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin): - - def __init__(self): + def __init__(self) -> None: self._punctuation_count = 0 # type: int self._symbol_count = 0 # type: int self._character_count = 0 # type: int @@ -56,10 +70,17 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin): def feed(self, character: str) -> None: self._character_count += 1 - if character != self._last_printable_char and character not in ["<", ">", "=", ":", "/", "&", ";", "{", "}", "[", "]"]: + if ( + character != self._last_printable_char + and character not in COMMON_SAFE_ASCII_CHARACTERS + ): if is_punctuation(character): self._punctuation_count += 1 - elif character.isdigit() is False and is_symbol(character): + elif ( + character.isdigit() is False + and is_symbol(character) + and is_emoticon(character) is False + ): self._symbol_count += 2 self._last_printable_char = character @@ -72,16 +93,17 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin): @property def ratio(self) -> float: if self._character_count == 0: - return 0. + return 0.0 - ratio_of_punctuation = (self._punctuation_count + self._symbol_count) / self._character_count # type: float + ratio_of_punctuation = ( + self._punctuation_count + self._symbol_count + ) / self._character_count # type: float - return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0. + return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0 class TooManyAccentuatedPlugin(MessDetectorPlugin): - - def __init__(self): + def __init__(self) -> None: self._character_count = 0 # type: int self._accentuated_count = 0 # type: int @@ -101,14 +123,15 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin): @property def ratio(self) -> float: if self._character_count == 0: - return 0. - ratio_of_accentuation = self._accentuated_count / self._character_count # type: float - return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0. + return 0.0 + ratio_of_accentuation = ( + self._accentuated_count / self._character_count + ) # type: float + return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0 class UnprintablePlugin(MessDetectorPlugin): - - def __init__(self): + def __init__(self) -> None: self._unprintable_count = 0 # type: int self._character_count = 0 # type: int @@ -116,7 +139,11 @@ class UnprintablePlugin(MessDetectorPlugin): return True def feed(self, character: str) -> None: - if character not in {'\n', '\t', '\r'} and character.isprintable() is False: + if ( + character.isspace() is False # includes \n \t \r \v + and character.isprintable() is False + and character != "\x1A" # Why? Its the ASCII substitute character. + ): self._unprintable_count += 1 self._character_count += 1 @@ -126,26 +153,31 @@ class UnprintablePlugin(MessDetectorPlugin): @property def ratio(self) -> float: if self._character_count == 0: - return 0. + return 0.0 return (self._unprintable_count * 8) / self._character_count class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin): - - def __init__(self): + def __init__(self) -> None: self._successive_count = 0 # type: int self._character_count = 0 # type: int self._last_latin_character = None # type: Optional[str] def eligible(self, character: str) -> bool: - return is_latin(character) + return character.isalpha() and is_latin(character) def feed(self, character: str) -> None: + self._character_count += 1 if self._last_latin_character is not None: if is_accentuated(character) and is_accentuated(self._last_latin_character): - if remove_accent(character) == remove_accent(self._last_latin_character): + if character.isupper() and self._last_latin_character.isupper(): + self._successive_count += 1 + # Worse if its the same char duplicated with different accent. + if remove_accent(character) == remove_accent( + self._last_latin_character + ): self._successive_count += 1 self._last_latin_character = character @@ -157,14 +189,13 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin): @property def ratio(self) -> float: if self._character_count == 0: - return 0. + return 0.0 return (self._successive_count * 2) / self._character_count class SuspiciousRange(MessDetectorPlugin): - - def __init__(self): + def __init__(self) -> None: self._suspicious_successive_range_count = 0 # type: int self._character_count = 0 # type: int self._last_printable_seen = None # type: Optional[str] @@ -175,15 +206,21 @@ class SuspiciousRange(MessDetectorPlugin): def feed(self, character: str) -> None: self._character_count += 1 + if ( + character.isspace() + or is_punctuation(character) + or character in COMMON_SAFE_ASCII_CHARACTERS + ): + self._last_printable_seen = None + return + if self._last_printable_seen is None: self._last_printable_seen = character return - if character.isspace() or is_punctuation(character): - self._last_printable_seen = None - return - - unicode_range_a = unicode_range(self._last_printable_seen) # type: Optional[str] + unicode_range_a = unicode_range( + self._last_printable_seen + ) # type: Optional[str] unicode_range_b = unicode_range(character) # type: Optional[str] if is_suspiciously_successive_range(unicode_range_a, unicode_range_b): @@ -199,22 +236,24 @@ class SuspiciousRange(MessDetectorPlugin): @property def ratio(self) -> float: if self._character_count == 0: - return 0. + return 0.0 - ratio_of_suspicious_range_usage = (self._suspicious_successive_range_count * 2) / self._character_count # type: float + ratio_of_suspicious_range_usage = ( + self._suspicious_successive_range_count * 2 + ) / self._character_count # type: float if ratio_of_suspicious_range_usage < 0.1: - return 0. + return 0.0 return ratio_of_suspicious_range_usage class SuperWeirdWordPlugin(MessDetectorPlugin): - - def __init__(self): + def __init__(self) -> None: self._word_count = 0 # type: int self._bad_word_count = 0 # type: int self._is_current_word_bad = False # type: bool + self._foreign_long_watch = False # type: bool self._character_count = 0 # type: int self._bad_character_count = 0 # type: int @@ -230,16 +269,30 @@ class SuperWeirdWordPlugin(MessDetectorPlugin): self._buffer = "".join([self._buffer, character]) if is_accentuated(character): self._buffer_accent_count += 1 + if ( + self._foreign_long_watch is False + and is_latin(character) is False + and is_cjk(character) is False + and is_hangul(character) is False + and is_katakana(character) is False + and is_hiragana(character) is False + and is_thai(character) is False + ): + self._foreign_long_watch = True return if not self._buffer: return - if (character.isspace() or is_punctuation(character) or is_separator(character)) and self._buffer: + if ( + character.isspace() or is_punctuation(character) or is_separator(character) + ) and self._buffer: self._word_count += 1 buffer_length = len(self._buffer) # type: int self._character_count += buffer_length - if buffer_length >= 4 and self._buffer_accent_count / buffer_length >= 0.3: + if buffer_length >= 4 and self._buffer_accent_count / buffer_length > 0.34: + self._is_current_word_bad = True + if buffer_length >= 24 and self._foreign_long_watch: self._is_current_word_bad = True if self._is_current_word_bad: @@ -247,15 +300,21 @@ class SuperWeirdWordPlugin(MessDetectorPlugin): self._bad_character_count += len(self._buffer) self._is_current_word_bad = False + self._foreign_long_watch = False self._buffer = "" self._buffer_accent_count = 0 - elif character not in {"<", ">", "-", "="} and character.isdigit() is False and is_symbol(character): + elif ( + character not in {"<", ">", "-", "="} + and character.isdigit() is False + and is_symbol(character) + ): self._is_current_word_bad = True self._buffer += character def reset(self) -> None: self._buffer = "" self._is_current_word_bad = False + self._foreign_long_watch = False self._bad_word_count = 0 self._word_count = 0 self._character_count = 0 @@ -263,19 +322,19 @@ class SuperWeirdWordPlugin(MessDetectorPlugin): @property def ratio(self) -> float: - if self._word_count <= 16: - return 0. + if self._word_count <= 10: + return 0.0 return self._bad_character_count / self._character_count class CjkInvalidStopPlugin(MessDetectorPlugin): """ - GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and can be easily detected. - Searching for the overuse of '丅' and '丄'. + GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and + can be easily detected. Searching for the overuse of '丅' and '丄'. """ - def __init__(self): + def __init__(self) -> None: self._wrong_stop_count = 0 # type: int self._cjk_character_count = 0 # type: int @@ -296,13 +355,12 @@ class CjkInvalidStopPlugin(MessDetectorPlugin): @property def ratio(self) -> float: if self._cjk_character_count < 16: - return 0. + return 0.0 return self._wrong_stop_count / self._cjk_character_count class ArchaicUpperLowerPlugin(MessDetectorPlugin): - - def __init__(self): + def __init__(self) -> None: self._buf = False # type: bool self._character_count_since_last_sep = 0 # type: int @@ -313,27 +371,51 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin): self._character_count = 0 # type: int self._last_alpha_seen = None # type: Optional[str] + self._current_ascii_only = True # type: bool def eligible(self, character: str) -> bool: - return character.isspace() or character.isalpha() + return True def feed(self, character: str) -> None: - if is_separator(character): - if self._character_count_since_last_sep < 24: - self._successive_upper_lower_count_final += self._successive_upper_lower_count + is_concerned = character.isalpha() and is_case_variable(character) + chunk_sep = is_concerned is False + + if chunk_sep and self._character_count_since_last_sep > 0: + if ( + self._character_count_since_last_sep <= 64 + and character.isdigit() is False + and self._current_ascii_only is False + ): + self._successive_upper_lower_count_final += ( + self._successive_upper_lower_count + ) + self._successive_upper_lower_count = 0 self._character_count_since_last_sep = 0 + self._last_alpha_seen = None + self._buf = False + self._character_count += 1 + self._current_ascii_only = True + + return + + if self._current_ascii_only is True and is_ascii(character) is False: + self._current_ascii_only = False if self._last_alpha_seen is not None: - if (character.isupper() and self._last_alpha_seen.islower()) or (character.islower() and self._last_alpha_seen.isupper()): + if (character.isupper() and self._last_alpha_seen.islower()) or ( + character.islower() and self._last_alpha_seen.isupper() + ): if self._buf is True: - self._successive_upper_lower_count += 1 + self._successive_upper_lower_count += 2 + self._buf = False else: self._buf = True else: self._buf = False self._character_count += 1 + self._character_count_since_last_sep += 1 self._last_alpha_seen = character def reset(self) -> None: @@ -342,16 +424,20 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin): self._successive_upper_lower_count = 0 self._successive_upper_lower_count_final = 0 self._last_alpha_seen = None + self._buf = False + self._current_ascii_only = True @property def ratio(self) -> float: if self._character_count == 0: - return 0. + return 0.0 - return (self._successive_upper_lower_count_final * 2) / self._character_count + return self._successive_upper_lower_count_final / self._character_count -def is_suspiciously_successive_range(unicode_range_a: Optional[str], unicode_range_b: Optional[str]) -> bool: +def is_suspiciously_successive_range( + unicode_range_a: Optional[str], unicode_range_b: Optional[str] +) -> bool: """ Determine if two Unicode range seen next to each other can be considered as suspicious. """ @@ -367,7 +453,9 @@ def is_suspiciously_successive_range(unicode_range_a: Optional[str], unicode_ran if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b: return False - keywords_range_a, keywords_range_b = unicode_range_a.split(" "), unicode_range_b.split(" ") + keywords_range_a, keywords_range_b = unicode_range_a.split( + " " + ), unicode_range_b.split(" ") for el in keywords_range_a: if el in UNICODE_SECONDARY_RANGE_KEYWORD: @@ -376,12 +464,19 @@ def is_suspiciously_successive_range(unicode_range_a: Optional[str], unicode_ran return False # Japanese Exception - if unicode_range_a in ['Katakana', 'Hiragana'] and unicode_range_b in ['Katakana', 'Hiragana']: - return False - - if unicode_range_a in ['Katakana', 'Hiragana'] or unicode_range_b in ['Katakana', 'Hiragana']: + range_a_jp_chars, range_b_jp_chars = ( + unicode_range_a + in ( + "Hiragana", + "Katakana", + ), + unicode_range_b in ("Hiragana", "Katakana"), + ) + if range_a_jp_chars or range_b_jp_chars: if "CJK" in unicode_range_a or "CJK" in unicode_range_b: return False + if range_a_jp_chars and range_b_jp_chars: + return False if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b: if "CJK" in unicode_range_a or "CJK" in unicode_range_b: @@ -390,30 +485,33 @@ def is_suspiciously_successive_range(unicode_range_a: Optional[str], unicode_ran return False # Chinese/Japanese use dedicated range for punctuation and/or separators. - if ('CJK' in unicode_range_a or 'CJK' in unicode_range_b) or (unicode_range_a in ['Katakana', 'Hiragana'] and unicode_range_b in ['Katakana', 'Hiragana']): - if 'Punctuation' in unicode_range_a or 'Punctuation' in unicode_range_b: + if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or ( + unicode_range_a in ["Katakana", "Hiragana"] + and unicode_range_b in ["Katakana", "Hiragana"] + ): + if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b: return False - if 'Forms' in unicode_range_a or 'Forms' in unicode_range_b: + if "Forms" in unicode_range_a or "Forms" in unicode_range_b: return False return True @lru_cache(maxsize=2048) -def mess_ratio(decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False) -> float: +def mess_ratio( + decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False +) -> float: """ Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier. """ - detectors = [] # type: List[MessDetectorPlugin] - for md_class in MessDetectorPlugin.__subclasses__(): - detectors.append( - md_class() - ) + detectors = [ + md_class() for md_class in MessDetectorPlugin.__subclasses__() + ] # type: List[MessDetectorPlugin] length = len(decoded_sequence) # type: int - mean_mess_ratio = 0. # type: float + mean_mess_ratio = 0.0 # type: float if length < 512: intermediary_mean_mess_ratio_calc = 32 # type: int @@ -427,25 +525,16 @@ def mess_ratio(decoded_sequence: str, maximum_threshold: float = 0.2, debug: boo if detector.eligible(character): detector.feed(character) - if (index > 0 and index % intermediary_mean_mess_ratio_calc == 0) or index == length-1: - mean_mess_ratio = sum( - [ - dt.ratio for dt in detectors - ] - ) + if ( + index > 0 and index % intermediary_mean_mess_ratio_calc == 0 + ) or index == length - 1: + mean_mess_ratio = sum([dt.ratio for dt in detectors]) if mean_mess_ratio >= maximum_threshold: break if debug: for dt in detectors: # pragma: nocover - print( - dt.__class__, - dt.ratio - ) - - return round( - mean_mess_ratio, - 3 - ) + print(dt.__class__, dt.ratio) + return round(mean_mess_ratio, 3) diff --git a/pipenv/vendor/charset_normalizer/models.py b/pipenv/vendor/charset_normalizer/models.py index a0a94cb7..68c27b89 100644 --- a/pipenv/vendor/charset_normalizer/models.py +++ b/pipenv/vendor/charset_normalizer/models.py @@ -1,25 +1,25 @@ import warnings +from collections import Counter from encodings.aliases import aliases from hashlib import sha256 from json import dumps -from typing import Optional, List, Tuple, Set -from collections import Counter -from re import sub, compile as re_compile +from re import sub +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union -from pipenv.vendor.charset_normalizer.constant import TOO_BIG_SEQUENCE -from pipenv.vendor.charset_normalizer.md import mess_ratio -from pipenv.vendor.charset_normalizer.utils import iana_name, is_multi_byte_encoding, unicode_range +from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE +from .md import mess_ratio +from .utils import iana_name, is_multi_byte_encoding, unicode_range class CharsetMatch: def __init__( - self, - payload: bytes, - guessed_encoding: str, - mean_mess_ratio: float, - has_sig_or_bom: bool, - languages: "CoherenceMatches", - decoded_payload: Optional[str] = None + self, + payload: bytes, + guessed_encoding: str, + mean_mess_ratio: float, + has_sig_or_bom: bool, + languages: "CoherenceMatches", + decoded_payload: Optional[str] = None, ): self._payload = payload # type: bytes @@ -30,19 +30,23 @@ class CharsetMatch: self._unicode_ranges = None # type: Optional[List[str]] self._leaves = [] # type: List[CharsetMatch] - self._mean_coherence_ratio = 0. # type: float + self._mean_coherence_ratio = 0.0 # type: float self._output_payload = None # type: Optional[bytes] self._output_encoding = None # type: Optional[str] self._string = decoded_payload # type: Optional[str] - def __eq__(self, other) -> bool: + def __eq__(self, other: object) -> bool: if not isinstance(other, CharsetMatch): - raise TypeError('__eq__ cannot be invoked on {} and {}.'.format(str(other.__class__), str(self.__class__))) + raise TypeError( + "__eq__ cannot be invoked on {} and {}.".format( + str(other.__class__), str(self.__class__) + ) + ) return self.encoding == other.encoding and self.fingerprint == other.fingerprint - def __lt__(self, other) -> bool: + def __lt__(self, other: object) -> bool: """ Implemented to make sorted available upon CharsetMatches items. """ @@ -50,13 +54,21 @@ class CharsetMatch: raise ValueError chaos_difference = abs(self.chaos - other.chaos) # type: float + coherence_difference = abs(self.coherence - other.coherence) # type: float # Bellow 1% difference --> Use Coherence - if chaos_difference < 0.01: + if chaos_difference < 0.01 and coherence_difference > 0.02: + # When having a tough decision, use the result that decoded as many multi-byte as possible. + if chaos_difference == 0.0 and self.coherence == other.coherence: + return self.multi_byte_usage > other.multi_byte_usage return self.coherence > other.coherence return self.chaos < other.chaos + @property + def multi_byte_usage(self) -> float: + return 1.0 - len(str(self)) / len(self.raw) + @property def chaos_secondary_pass(self) -> float: """ @@ -64,11 +76,11 @@ class CharsetMatch: Use with caution, this can be very slow. Notice: Will be removed in 3.0 """ - warnings.warn("chaos_secondary_pass is deprecated and will be removed in 3.0", DeprecationWarning) - return mess_ratio( - str(self), - 1. + warnings.warn( + "chaos_secondary_pass is deprecated and will be removed in 3.0", + DeprecationWarning, ) + return mess_ratio(str(self), 1.0) @property def coherence_non_latin(self) -> float: @@ -76,8 +88,11 @@ class CharsetMatch: Coherence ratio on the first non-latin language detected if ANY. Notice: Will be removed in 3.0 """ - warnings.warn("coherence_non_latin is deprecated and will be removed in 3.0", DeprecationWarning) - return 0. + warnings.warn( + "coherence_non_latin is deprecated and will be removed in 3.0", + DeprecationWarning, + ) + return 0.0 @property def w_counter(self) -> Counter: @@ -85,9 +100,11 @@ class CharsetMatch: Word counter instance on decoded text. Notice: Will be removed in 3.0 """ - warnings.warn("w_counter is deprecated and will be removed in 3.0", DeprecationWarning) - not_printable_pattern = re_compile(r'[0-9\W\n\r\t]+') - string_printable_only = sub(not_printable_pattern, ' ', str(self).lower()) + warnings.warn( + "w_counter is deprecated and will be removed in 3.0", DeprecationWarning + ) + + string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower()) return Counter(string_printable_only.split()) @@ -102,7 +119,11 @@ class CharsetMatch: def add_submatch(self, other: "CharsetMatch") -> None: if not isinstance(other, CharsetMatch) or other == self: - raise ValueError("Unable to add instance <{}> as a submatch of a CharsetMatch".format(other.__class__)) + raise ValueError( + "Unable to add instance <{}> as a submatch of a CharsetMatch".format( + other.__class__ + ) + ) other._string = None # Unload RAM usage; dirty trick. self._leaves.append(other) @@ -153,9 +174,13 @@ class CharsetMatch: return "English" # doing it there to avoid circular import - from pipenv.vendor.charset_normalizer.cd import mb_encoding_languages, encoding_languages + from charset_normalizer.cd import encoding_languages, mb_encoding_languages - languages = mb_encoding_languages(self.encoding) if is_multi_byte_encoding(self.encoding) else encoding_languages(self.encoding) + languages = ( + mb_encoding_languages(self.encoding) + if is_multi_byte_encoding(self.encoding) + else encoding_languages(self.encoding) + ) if len(languages) == 0 or "Latin Based" in languages: return "Unknown" @@ -171,7 +196,7 @@ class CharsetMatch: @property def coherence(self) -> float: if not self._languages: - return 0. + return 0.0 return self._languages[0][1] @property @@ -201,12 +226,12 @@ class CharsetMatch: def alphabets(self) -> List[str]: if self._unicode_ranges is not None: return self._unicode_ranges - detected_ranges = set() # type: Set[str] - for character in str(self): - detected_ranges.add( - unicode_range(character) - ) - self._unicode_ranges = sorted(list(detected_ranges)) + # list detected ranges + detected_ranges = [ + unicode_range(char) for char in str(self) + ] # type: List[Optional[str]] + # filter and sort + self._unicode_ranges = sorted(list({r for r in detected_ranges if r})) return self._unicode_ranges @property @@ -254,14 +279,15 @@ class CharsetMatches: Container with every CharsetMatch items ordered by default from most probable to the less one. Act like a list(iterable) but does not implements all related methods. """ + def __init__(self, results: List[CharsetMatch] = None): self._results = sorted(results) if results else [] # type: List[CharsetMatch] - def __iter__(self): + def __iter__(self) -> Iterator[CharsetMatch]: for result in self._results: yield result - def __getitem__(self, item) -> CharsetMatch: + def __getitem__(self, item: Union[int, str]) -> CharsetMatch: """ Retrieve a single item either by its position or encoding name (alias may be used here). Raise KeyError upon invalid index or encoding not present in results. @@ -278,17 +304,24 @@ class CharsetMatches: def __len__(self) -> int: return len(self._results) + def __bool__(self) -> bool: + return len(self._results) > 0 + def append(self, item: CharsetMatch) -> None: """ Insert a single match. Will be inserted accordingly to preserve sort. Can be inserted as a submatch. """ if not isinstance(item, CharsetMatch): - raise ValueError("Cannot append instance '{}' to CharsetMatches".format(str(item.__class__))) + raise ValueError( + "Cannot append instance '{}' to CharsetMatches".format( + str(item.__class__) + ) + ) # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage) if len(item.raw) <= TOO_BIG_SEQUENCE: for match in self._results: - if match.fingerprint == item.fingerprint: + if match.fingerprint == item.fingerprint and match.chaos == item.chaos: match.add_submatch(item) return self._results.append(item) @@ -314,11 +347,23 @@ CoherenceMatches = List[CoherenceMatch] class CliDetectionResult: - - def __init__(self, path: str, encoding: str, encoding_aliases: List[str], alternative_encodings: List[str], language: str, alphabets: List[str], has_sig_or_bom: bool, chaos: float, coherence: float, unicode_path: Optional[str], is_preferred: bool): + def __init__( + self, + path: str, + encoding: Optional[str], + encoding_aliases: List[str], + alternative_encodings: List[str], + language: str, + alphabets: List[str], + has_sig_or_bom: bool, + chaos: float, + coherence: float, + unicode_path: Optional[str], + is_preferred: bool, + ): self.path = path # type: str self.unicode_path = unicode_path # type: Optional[str] - self.encoding = encoding # type: str + self.encoding = encoding # type: Optional[str] self.encoding_aliases = encoding_aliases # type: List[str] self.alternative_encodings = alternative_encodings # type: List[str] self.language = language # type: str @@ -329,27 +374,20 @@ class CliDetectionResult: self.is_preferred = is_preferred # type: bool @property - def __dict__(self): + def __dict__(self) -> Dict[str, Any]: # type: ignore return { - 'path': self.path, - 'encoding': self.encoding, - 'encoding_aliases': self.encoding_aliases, - 'alternative_encodings': self.alternative_encodings, - 'language': self.language, - 'alphabets': self.alphabets, - 'has_sig_or_bom': self.has_sig_or_bom, - 'chaos': self.chaos, - 'coherence': self.coherence, - 'unicode_path': self.unicode_path, - 'is_preferred': self.is_preferred + "path": self.path, + "encoding": self.encoding, + "encoding_aliases": self.encoding_aliases, + "alternative_encodings": self.alternative_encodings, + "language": self.language, + "alphabets": self.alphabets, + "has_sig_or_bom": self.has_sig_or_bom, + "chaos": self.chaos, + "coherence": self.coherence, + "unicode_path": self.unicode_path, + "is_preferred": self.is_preferred, } def to_json(self) -> str: - return dumps( - self.__dict__, - ensure_ascii=True, - indent=4 - ) - - -CharsetNormalizerMatch = CharsetMatch + return dumps(self.__dict__, ensure_ascii=True, indent=4) diff --git a/pipenv/vendor/charset_normalizer/utils.py b/pipenv/vendor/charset_normalizer/utils.py index 1d674e4e..b9d12784 100644 --- a/pipenv/vendor/charset_normalizer/utils.py +++ b/pipenv/vendor/charset_normalizer/utils.py @@ -1,19 +1,25 @@ try: import unicodedata2 as unicodedata except ImportError: - import unicodedata + import unicodedata # type: ignore[no-redef] -from codecs import IncrementalDecoder -from re import findall -from typing import Optional, Tuple, Union, List, Set import importlib -from _multibytecodec import MultibyteIncrementalDecoder # type: ignore - +from codecs import IncrementalDecoder from encodings.aliases import aliases from functools import lru_cache +from re import findall +from typing import List, Optional, Set, Tuple, Union -from pipenv.vendor.charset_normalizer.constant import UNICODE_RANGES_COMBINED, UNICODE_SECONDARY_RANGE_KEYWORD, \ - RE_POSSIBLE_ENCODING_INDICATION, ENCODING_MARKS, UTF8_MAXIMAL_ALLOCATION, IANA_SUPPORTED_SIMILAR +from _multibytecodec import MultibyteIncrementalDecoder # type: ignore + +from .constant import ( + ENCODING_MARKS, + IANA_SUPPORTED_SIMILAR, + RE_POSSIBLE_ENCODING_INDICATION, + UNICODE_RANGES_COMBINED, + UNICODE_SECONDARY_RANGE_KEYWORD, + UTF8_MAXIMAL_ALLOCATION, +) @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) @@ -22,7 +28,14 @@ def is_accentuated(character: str) -> bool: description = unicodedata.name(character) # type: str except ValueError: return False - return "WITH GRAVE" in description or "WITH ACUTE" in description or "WITH CEDILLA" in description + return ( + "WITH GRAVE" in description + or "WITH ACUTE" in description + or "WITH CEDILLA" in description + or "WITH DIAERESIS" in description + or "WITH CIRCUMFLEX" in description + or "WITH TILDE" in description + ) @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) @@ -33,12 +46,7 @@ def remove_accent(character: str) -> str: codes = decomposed.split(" ") # type: List[str] - return chr( - int( - codes[0], - 16 - ) - ) + return chr(int(codes[0], 16)) @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) @@ -64,6 +72,14 @@ def is_latin(character: str) -> bool: return "LATIN" in description +def is_ascii(character: str) -> bool: + try: + character.encode("ascii") + except UnicodeEncodeError: + return False + return True + + @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_punctuation(character: str) -> bool: character_category = unicodedata.category(character) # type: str @@ -94,9 +110,19 @@ def is_symbol(character: str) -> bool: return "Forms" in character_range +@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) +def is_emoticon(character: str) -> bool: + character_range = unicode_range(character) # type: Optional[str] + + if character_range is None: + return False + + return "Emoticons" in character_range + + @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_separator(character: str) -> bool: - if character.isspace() or character in ["|", "+"]: + if character.isspace() or character in ["|", "+", ",", ";", "<", ">"]: return True character_category = unicodedata.category(character) # type: str @@ -104,12 +130,18 @@ def is_separator(character: str) -> bool: return "Z" in character_category +@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) +def is_case_variable(character: str) -> bool: + return character.islower() != character.isupper() + + def is_private_use_only(character: str) -> bool: character_category = unicodedata.category(character) # type: str return "Co" == character_category +@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_cjk(character: str) -> bool: try: character_name = unicodedata.name(character) @@ -119,6 +151,46 @@ def is_cjk(character: str) -> bool: return "CJK" in character_name +@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) +def is_hiragana(character: str) -> bool: + try: + character_name = unicodedata.name(character) + except ValueError: + return False + + return "HIRAGANA" in character_name + + +@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) +def is_katakana(character: str) -> bool: + try: + character_name = unicodedata.name(character) + except ValueError: + return False + + return "KATAKANA" in character_name + + +@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) +def is_hangul(character: str) -> bool: + try: + character_name = unicodedata.name(character) + except ValueError: + return False + + return "HANGUL" in character_name + + +@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) +def is_thai(character: str) -> bool: + try: + character_name = unicodedata.name(character) + except ValueError: + return False + + return "THAI" in character_name + + @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED)) def is_unicode_range_secondary(range_name: str) -> bool: for keyword in UNICODE_SECONDARY_RANGE_KEYWORD: @@ -139,14 +211,16 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional results = findall( RE_POSSIBLE_ENCODING_INDICATION, - sequence[:seq_len if seq_len <= search_zone else search_zone].decode('ascii', errors='ignore') + sequence[: seq_len if seq_len <= search_zone else search_zone].decode( + "ascii", errors="ignore" + ), ) # type: List[str] if len(results) == 0: return None for specified_encoding in results: - specified_encoding = specified_encoding.lower().replace('-', '_') + specified_encoding = specified_encoding.lower().replace("-", "_") for encoding_alias, encoding_iana in aliases.items(): if encoding_alias == specified_encoding: @@ -162,9 +236,19 @@ def is_multi_byte_encoding(name: str) -> bool: """ Verify is a specific encoding is a multi byte one based on it IANA name """ - return name in {"utf_8", "utf_8_sig", "utf_16", "utf_16_be", "utf_16_le", "utf_32", "utf_32_le", "utf_32_be", "utf_7"} or issubclass( - importlib.import_module('encodings.{}'.format(name)).IncrementalDecoder, # type: ignore - MultibyteIncrementalDecoder + return name in { + "utf_8", + "utf_8_sig", + "utf_16", + "utf_16_be", + "utf_16_le", + "utf_32", + "utf_32_le", + "utf_32_be", + "utf_7", + } or issubclass( + importlib.import_module("encodings.{}".format(name)).IncrementalDecoder, # type: ignore + MultibyteIncrementalDecoder, ) @@ -191,7 +275,7 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool: def iana_name(cp_name: str, strict: bool = True) -> str: - cp_name = cp_name.lower().replace('-', '_') + cp_name = cp_name.lower().replace("-", "_") for encoding_alias, encoding_iana in aliases.items(): if cp_name == encoding_alias or cp_name == encoding_iana: @@ -212,9 +296,7 @@ def range_scan(decoded_sequence: str) -> List[str]: if character_range is None: continue - ranges.add( - character_range - ) + ranges.add(character_range) return list(ranges) @@ -222,10 +304,10 @@ def range_scan(decoded_sequence: str) -> List[str]: def cp_similarity(iana_name_a: str, iana_name_b: str) -> float: if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b): - return 0. + return 0.0 - decoder_a = importlib.import_module('encodings.{}'.format(iana_name_a)).IncrementalDecoder # type: ignore - decoder_b = importlib.import_module('encodings.{}'.format(iana_name_b)).IncrementalDecoder # type: ignore + decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder # type: ignore + decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder # type: ignore id_a = decoder_a(errors="ignore") # type: IncrementalDecoder id_b = decoder_b(errors="ignore") # type: IncrementalDecoder @@ -245,4 +327,7 @@ def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool: Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using the function cp_similarity. """ - return iana_name_a in IANA_SUPPORTED_SIMILAR and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a] + return ( + iana_name_a in IANA_SUPPORTED_SIMILAR + and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a] + ) diff --git a/pipenv/vendor/charset_normalizer/version.py b/pipenv/vendor/charset_normalizer/version.py index 12f4b340..98e53fb3 100644 --- a/pipenv/vendor/charset_normalizer/version.py +++ b/pipenv/vendor/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "2.0.3" -VERSION = __version__.split('.') +__version__ = "2.0.7" +VERSION = __version__.split(".") diff --git a/pipenv/vendor/vendor.txt b/pipenv/vendor/vendor.txt index 45d5c221..0530062e 100644 --- a/pipenv/vendor/vendor.txt +++ b/pipenv/vendor/vendor.txt @@ -3,7 +3,7 @@ attrs==21.2.0 cached-property==1.5.2 cerberus==1.3.4 certifi==2021.5.30 -charset-normalizer==2.0.3 +charset-normalizer==2.0.7 click-didyoumean==0.0.3 click==8.0.3 colorama==0.4.4 From ff4529a15e2ea90edc9c51be921e6b89e1bdf457 Mon Sep 17 00:00:00 2001 From: Frost Ming Date: Wed, 17 Nov 2021 11:09:19 +0800 Subject: [PATCH 2/5] add news entry --- news/4865.bugfix.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 news/4865.bugfix.rst diff --git a/news/4865.bugfix.rst b/news/4865.bugfix.rst new file mode 100644 index 00000000..7a415209 --- /dev/null +++ b/news/4865.bugfix.rst @@ -0,0 +1 @@ +Update ``charset-normalizer`` from ``2.0.3`` to ``2.0.7``, this fixes an import error on Python 3.6. From 21b6a0839f98e6712cda51068f25d93ec6a45b6e Mon Sep 17 00:00:00 2001 From: Frost Ming Date: Wed, 17 Nov 2021 11:10:34 +0800 Subject: [PATCH 3/5] Include py3.6 testing in CI matrix --- .github/workflows/ci.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4b8c07e6..97acea0a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -41,6 +41,9 @@ jobs: matrix: python-version: [3.7, 3.8, 3.9, "3.10"] os: [MacOS, Ubuntu, Windows] + include: + - python-version: 3.6 + os: Ubuntu steps: - uses: actions/checkout@v1 From b90c4695908b281211beb13ebdc5dfa6a5797b82 Mon Sep 17 00:00:00 2001 From: Frost Ming Date: Wed, 17 Nov 2021 12:07:20 +0800 Subject: [PATCH 4/5] Update dependencies --- Pipfile | 4 +- Pipfile.lock | 239 ++++++++++++++++++++------------------------------- 2 files changed, 94 insertions(+), 149 deletions(-) diff --git a/Pipfile b/Pipfile index 8ff4ad04..a4888164 100644 --- a/Pipfile +++ b/Pipfile @@ -4,9 +4,7 @@ sphinx-click = "<3" click = "*" pytest_pypi = {path = "./tests/pytest-pypi", editable = true} stdeb = {version="*", markers="sys_platform == 'linux'"} -jedi = "*" -isort = "*" -rope = "*" +dataclasses = {version="*", markers="python_version < '3.7'"} sphinxcontrib-spelling = "<4.3.0" [packages] diff --git a/Pipfile.lock b/Pipfile.lock index 0cf8406b..7a84d97c 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "2caa0b5a50a8b6911a1cb6d4c7cc8040686345e460c52a32ae7cb0f4ed34385d" + "sha256": "b6632ccfba082244f188747d88665264be87621552d2c1bbebaf36174bc24e8a" }, "pipfile-spec": 6, "requires": {}, @@ -22,13 +22,6 @@ ], "version": "==0.7.12" }, - "appdirs": { - "hashes": [ - "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41", - "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128" - ], - "version": "==1.4.4" - }, "arpeggio": { "hashes": [ "sha256:bfe349f252f82f82d84cb886f1d5081d1a31451e6045275e9f90b65d0daa06f1", @@ -36,14 +29,6 @@ ], "version": "==1.10.2" }, - "atomicwrites": { - "hashes": [ - "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197", - "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a" - ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==1.4.0" - }, "attrs": { "hashes": [ "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1", @@ -62,11 +47,11 @@ }, "backports.entry-points-selectable": { "hashes": [ - "sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a", - "sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc" + "sha256:7fceed9532a7aa2bd888654a7314f864a3c16a4e710b34a58cfc0f08114c663b", + "sha256:914b21a479fde881635f7af5adc7f6e38d6b274be32269070c53b698c60d5386" ], "markers": "python_version >= '2.7'", - "version": "==1.1.0" + "version": "==1.1.1" }, "beautifulsoup4": { "hashes": [ @@ -78,11 +63,11 @@ }, "black": { "hashes": [ - "sha256:6eb7448da9143ee65b856a5f3676b7dda98ad9abe0f87fce8c59291f15e82a5b", - "sha256:a9952229092e325fe5f3dae56d81f639b23f7131eb840781947e4b2886030f33" + "sha256:0b1f66cbfadcd332ceeaeecf6373d9991d451868d2e2219ad0ac1213fb701117", + "sha256:83f3852301c8dcb229e9c444dd79f573c8d31c7c2dad9bbaaa94c808630e32aa" ], "markers": "python_full_version >= '3.6.2'", - "version": "==21.10b0" + "version": "==21.11b0" }, "bleach": { "hashes": [ @@ -135,6 +120,14 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==0.4.4" }, + "dataclasses": { + "hashes": [ + "sha256:454a69d788c7fda44efd71e259be79577822f5e3f53f029a22d08004e951dc9f", + "sha256:6988bd2b895eef432d562370bb707d540f32f7360ab13da45340101bc2307d84" + ], + "markers": "python_version < '3.7'", + "version": "==0.6" + }, "distlib": { "hashes": [ "sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31", @@ -144,11 +137,11 @@ }, "docutils": { "hashes": [ - "sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125", - "sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61" + "sha256:0c5b78adfbf7762415433f5515cd5c9e762339e23369dbe8000d84a4bf4ab3af", + "sha256:c2de3a60e9e7d07be26b7f2b00ca0309c207e06c100f9cc2a94931fc75a478fc" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==0.17.1" + "version": "==0.16" }, "execnet": { "hashes": [ @@ -160,11 +153,11 @@ }, "filelock": { "hashes": [ - "sha256:7afc856f74fa7006a289fd10fa840e1eebd8bbff6bffb69c26c54a0512ea8cf8", - "sha256:bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b" + "sha256:2e139a228bcf56dd8b2274a65174d005c4a6b68540ee0bdbb92c76f43f29f7e8", + "sha256:93d512b32a23baf4cac44ffd72ccf70732aeff7b8050fcaf6d3ec406d954baf4" ], "markers": "python_version >= '3.6'", - "version": "==3.3.2" + "version": "==3.4.0" }, "flake8": { "hashes": [ @@ -200,19 +193,19 @@ }, "imagesize": { "hashes": [ - "sha256:6965f19a6a2039c7d48bca7dba2473069ff854c36ae6f19d2cde309d998228a1", - "sha256:b1f6b5a4eab1f73479a50fb79fcf729514a900c341d8503d62a62dbc4127a2b1" + "sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c", + "sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==1.2.0" + "version": "==1.3.0" }, "importlib-metadata": { "hashes": [ - "sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15", - "sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1" + "sha256:53ccfd5c134223e497627b9815d5030edf77d2ed573922f7a0b8f8bb81a1c100", + "sha256:75bdec14c397f528724c1bfd9709d660b33a4d2e77387a3358f20b848bb5e5fb" ], "markers": "python_version >= '3.6'", - "version": "==4.8.1" + "version": "==4.8.2" }, "incremental": { "hashes": [ @@ -236,14 +229,6 @@ ], "version": "==1.6.0" }, - "isort": { - "hashes": [ - "sha256:1a18ccace2ed8910bd9458b74a3ecbafd7b2f581301b0ab65cfdd4338272d76f", - "sha256:e52ff6d38012b131628cf0f26c51e7bd3a7c81592eefe3ac71411e692f1b9345" - ], - "index": "pypi", - "version": "==5.10.0" - }, "itsdangerous": { "hashes": [ "sha256:5174094b9637652bdb841a3029700391451bd092ba3db90600dea710ba28e97c", @@ -252,21 +237,13 @@ "markers": "python_version >= '3.6'", "version": "==2.0.1" }, - "jedi": { - "hashes": [ - "sha256:18456d83f65f400ab0c2d3319e48520420ef43b23a086fdc05dff34132f0fb93", - "sha256:92550a404bad8afed881a137ec9a461fed49eca661414be45059329614ed0707" - ], - "index": "pypi", - "version": "==0.18.0" - }, "jinja2": { "hashes": [ - "sha256:827a0e32839ab1600d4eb1c4c33ec5a8edfbc5cb42dafa13b81f182f97784b45", - "sha256:8569982d3f0889eed11dd620c706d39b60c36d6d25843961f33f77fb6bc6b20c" + "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", + "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7" ], "markers": "python_version >= '3.6'", - "version": "==3.0.2" + "version": "==3.0.3" }, "keyring": { "hashes": [ @@ -381,14 +358,6 @@ "markers": "python_version >= '3.6'", "version": "==21.2" }, - "parso": { - "hashes": [ - "sha256:12b83492c6239ce32ff5eed6d3639d6a536170723c6f3f1506869f1ace413398", - "sha256:a8c4922db71e4fdb90e0d0bc6e50f9b273d3397925e5e60a717e719201778d22" - ], - "markers": "python_version >= '3.6'", - "version": "==0.8.2" - }, "parver": { "hashes": [ "sha256:41a548c51b006a2f2522b54293cbfd2514bffa10774ece8430c9964a20cbd8b4", @@ -545,57 +514,10 @@ }, "regex": { "hashes": [ - "sha256:0075fe4e2c2720a685fef0f863edd67740ff78c342cf20b2a79bc19388edf5db", - "sha256:0621c90f28d17260b41838b22c81a79ff436141b322960eb49c7b3f91d1cbab6", - "sha256:070336382ca92c16c45b4066c4ba9fa83fb0bd13d5553a82e07d344df8d58a84", - "sha256:075b0fdbaea81afcac5a39a0d1bb91de887dd0d93bf692a5dd69c430e7fc58cb", - "sha256:07e3755e0f070bc31567dfe447a02011bfa8444239b3e9e5cca6773a22133839", - "sha256:0ed3465acf8c7c10aa2e0f3d9671da410ead63b38a77283ef464cbb64275df58", - "sha256:17e095f7f96a4b9f24b93c2c915f31a5201a6316618d919b0593afb070a5270e", - "sha256:1d85ca137756d62c8138c971453cafe64741adad1f6a7e63a22a5a8abdbd19fa", - "sha256:20605bfad484e1341b2cbfea0708e4b211d233716604846baa54b94821f487cb", - "sha256:23f93e74409c210de4de270d4bf88fb8ab736a7400f74210df63a93728cf70d6", - "sha256:2bb7cae741de1aa03e3dd3a7d98c304871eb155921ca1f0d7cc11f5aade913fd", - "sha256:2e3ff69ab203b54ce5c480c3ccbe959394ea5beef6bd5ad1785457df7acea92e", - "sha256:30fe317332de0e50195665bc61a27d46e903d682f94042c36b3f88cb84bd7958", - "sha256:3576e173e7b4f88f683b4de7db0c2af1b209bb48b2bf1c827a6f3564fad59a97", - "sha256:35ed5714467fc606551db26f80ee5d6aa1f01185586a7bccd96f179c4b974a11", - "sha256:41c66bd6750237a8ed23028a6c9173dc0c92dc24c473e771d3bfb9ee817700c3", - "sha256:48b4f4810117a9072a5aa70f7fea5f86fa9efbe9a798312e0a05044bd707cc33", - "sha256:4abf35e16f4b639daaf05a2602c1b1d47370e01babf9821306aa138924e3fe92", - "sha256:4fba661a4966adbd2c3c08d3caad6822ecb6878f5456588e2475ae23a6e47929", - "sha256:5e85dcfc5d0f374955015ae12c08365b565c6f1eaf36dd182476a4d8e5a1cdb7", - "sha256:77f9d16f7970791f17ecce7e7f101548314ed1ee2583d4268601f30af3170856", - "sha256:7ee36d5113b6506b97f45f2e8447cb9af146e60e3f527d93013d19f6d0405f3b", - "sha256:7fab29411d75c2eb48070020a40f80255936d7c31357b086e5931c107d48306e", - "sha256:85289c25f658e3260b00178757c87f033f3d4b3e40aa4abdd4dc875ff11a94fb", - "sha256:886f459db10c0f9d17c87d6594e77be915f18d343ee138e68d259eb385f044a8", - "sha256:897c539f0f3b2c3a715be651322bef2167de1cdc276b3f370ae81a3bda62df71", - "sha256:8fbe1768feafd3d0156556677b8ff234c7bf94a8110e906b2d73506f577a3269", - "sha256:9267e4fba27e6dd1008c4f2983cc548c98b4be4444e3e342db11296c0f45512f", - "sha256:9486ebda015913909bc28763c6b92fcc3b5e5a67dee4674bceed112109f5dfb8", - "sha256:956187ff49db7014ceb31e88fcacf4cf63371e6e44d209cf8816cd4a2d61e11a", - "sha256:a56735c35a3704603d9d7b243ee06139f0837bcac2171d9ba1d638ce1df0742a", - "sha256:ab1fea8832976ad0bebb11f652b692c328043057d35e9ebc78ab0a7a30cf9a70", - "sha256:adf35d88d9cffc202e6046e4c32e1e11a1d0238b2fcf095c94f109e510ececea", - "sha256:af23b9ca9a874ef0ec20e44467b8edd556c37b0f46f93abfa93752ea7c0e8d1e", - "sha256:b3794cea825f101fe0df9af8a00f9fad8e119c91e39a28636b95ee2b45b6c2e5", - "sha256:bb11c982a849dc22782210b01d0c1b98eb3696ce655d58a54180774e4880ac66", - "sha256:be30cd315db0168063a1755fa20a31119da91afa51da2907553493516e165640", - "sha256:c6238d30dcff141de076344cf7f52468de61729c2f70d776fce12f55fe8df790", - "sha256:cb1e44d860345ab5d4f533b6c37565a22f403277f44c4d2d5e06c325da959883", - "sha256:d4bfe3bc3976ccaeb4ae32f51e631964e2f0e85b2b752721b7a02de5ce3b7f27", - "sha256:d8ee91e1c295beb5c132ebd78616814de26fedba6aa8687ea460c7f5eb289b72", - "sha256:e3c00cb5c71da655e1e5161481455479b613d500dd1bd252aa01df4f037c641f", - "sha256:e9cec3a62d146e8e122d159ab93ac32c988e2ec0dcb1e18e9e53ff2da4fbd30c", - "sha256:ef4e53e2fdc997d91f5b682f81f7dc9661db9a437acce28745d765d251902d85", - "sha256:f0148988af0182a0a4e5020e7c168014f2c55a16d11179610f7883dd48ac0ebe", - "sha256:f20f9f430c33597887ba9bd76635476928e76cad2981643ca8be277b8e97aa96", - "sha256:f5930d334c2f607711d54761956aedf8137f83f1b764b9640be21d25a976f3a4", - "sha256:f6a28e87ba69f3a4f30d775b179aac55be1ce59f55799328a0d9b6df8f16b39d", - "sha256:f9ee98d658a146cb6507be720a0ce1b44f2abef8fb43c2859791d91aace17cd5" + "sha256:537ca6a3586931b16a85ac38c08cc48f10fc870a5b25e51794c74df843e9966d", + "sha256:f341ee2df0999bfdf7a95e448075effe0db212a59387de1a70690e4acb03d4c6" ], - "version": "==2021.11.2" + "version": "==2021.11.10" }, "requests": { "hashes": [ @@ -619,20 +541,13 @@ ], "version": "==1.5.0" }, - "rope": { - "hashes": [ - "sha256:366789e069a267296889b2ee7631f9278173b5e7d468f2ea08abe26069a52aef" - ], - "index": "pypi", - "version": "==0.21.0" - }, "setuptools": { "hashes": [ - "sha256:a481fbc56b33f5d8f6b33dce41482e64c68b668be44ff42922903b03872590bf", - "sha256:dae6b934a965c8a59d6d230d3867ec408bb95e73bd538ff77e71fedf1eaca729" + "sha256:94ee891f4759150cded601a6beb6b08400413aefd0267b692f3f8c6e0bb238e7", + "sha256:fb537610c2dfe77b5896e3ee53dd53fbdd9adc48076c8f28cee3a30fb59a5038" ], "markers": "python_version >= '3.6'", - "version": "==58.5.3" + "version": "==59.1.1" }, "six": { "hashes": [ @@ -644,26 +559,26 @@ }, "snowballstemmer": { "hashes": [ - "sha256:b51b447bea85f9968c13b650126a888aabd4cb4463fca868ec596826325dedc2", - "sha256:e997baa4f2e9139951b6f4c631bad912dfd3c792467e2f03d7239464af90e914" + "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", + "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a" ], - "version": "==2.1.0" + "version": "==2.2.0" }, "soupsieve": { "hashes": [ - "sha256:617ffc4d0dfd39c66f4d1413a6e165663a34eca86be9b54f97b91756300ff6df", - "sha256:e4860f889dfa88774c07da0b276b70c073b6470fa1a4a8350800bb7bce3dcc76" + "sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb", + "sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9" ], "markers": "python_version >= '3.6'", - "version": "==2.3" + "version": "==2.3.1" }, "sphinx": { "hashes": [ - "sha256:9f3e17c64b34afc653d7c5ec95766e03043cc6d80b0de224f59b6b6e19d37c3c", - "sha256:c7658aab75c920288a8cf6f09f244c6cfdae30d82d803ac1634d9f223a80ca08" + "sha256:19010b7b9fa0dc7756a6e105b2aacd3a80f798af3c25c273be64d7beeb482cb1", + "sha256:2320d4e994a191f4b4be27da514e46b3d6b420f2ff895d064f52415d342461e8" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==1.8.5" + "markers": "python_version >= '3.5'", + "version": "==3.5.4" }, "sphinx-click": { "hashes": [ @@ -673,6 +588,46 @@ "index": "pypi", "version": "==2.7.1" }, + "sphinxcontrib-applehelp": { + "hashes": [ + "sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a", + "sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.2" + }, + "sphinxcontrib-devhelp": { + "hashes": [ + "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", + "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.2" + }, + "sphinxcontrib-htmlhelp": { + "hashes": [ + "sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07", + "sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2" + ], + "markers": "python_version >= '3.6'", + "version": "==2.0.0" + }, + "sphinxcontrib-jsmath": { + "hashes": [ + "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", + "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.1" + }, + "sphinxcontrib-qthelp": { + "hashes": [ + "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", + "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.3" + }, "sphinxcontrib-serializinghtml": { "hashes": [ "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd", @@ -689,14 +644,6 @@ "index": "pypi", "version": "==4.2.1" }, - "sphinxcontrib-websupport": { - "hashes": [ - "sha256:4edf0223a0685a7c485ae5a156b6f529ba1ee481a1417817935b20bde1956232", - "sha256:6fc9287dfc823fe9aa432463edd6cea47fa9ebbf488d7f289b322ffcfca075c7" - ], - "markers": "python_version >= '3.5'", - "version": "==1.2.4" - }, "stdeb": { "hashes": [ "sha256:08c22c9c03b28a140fe3ec5064b53a5288279f22e596ca06b0be698d50c93cf2" @@ -737,19 +684,19 @@ }, "twine": { "hashes": [ - "sha256:218c42324121d4417cbcbbda59c623b8acc4becfce3daa545e6b6dd48bd21385", - "sha256:3725b79a6f1cfe84a134544ae1894706e60719ab28547cb6c6de781b9f72706d" + "sha256:4caad5ef4722e127b3749052fcbffaaf71719b19d4fd4973b29c469957adeba2", + "sha256:916070f8ecbd1985ebed5dbb02b9bda9a092882a96d7069d542d4fc0bb5c673c" ], "markers": "python_version >= '3.6'", - "version": "==3.5.0" + "version": "==3.6.0" }, "typing-extensions": { "hashes": [ - "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e", - "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7", - "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34" + "sha256:2cdf80e4e04866a9b3689a51869016d36db0814d84b8d8a568d22781d45d27ed", + "sha256:829704698b22e13ec9eaf959122315eabb370b0884400e9818334d8b677023d9" ], - "version": "==3.10.0.2" + "markers": "python_version >= '3.6'", + "version": "==4.0.0" }, "urllib3": { "hashes": [ From 4a244d26712b1105425a26135ddade7753b2f19e Mon Sep 17 00:00:00 2001 From: Frost Ming Date: Wed, 17 Nov 2021 14:41:00 +0800 Subject: [PATCH 5/5] lock on windows python36 --- Pipfile.lock | 109 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 103 insertions(+), 6 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index 7a84d97c..9bd75187 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -29,6 +29,14 @@ ], "version": "==1.10.2" }, + "atomicwrites": { + "hashes": [ + "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197", + "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a" + ], + "markers": "sys_platform == 'win32'", + "version": "==1.4.0" + }, "attrs": { "hashes": [ "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1", @@ -117,16 +125,17 @@ "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b", "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "markers": "sys_platform == 'win32'", "version": "==0.4.4" }, "dataclasses": { "hashes": [ - "sha256:454a69d788c7fda44efd71e259be79577822f5e3f53f029a22d08004e951dc9f", - "sha256:6988bd2b895eef432d562370bb707d540f32f7360ab13da45340101bc2307d84" + "sha256:0201d89fa866f68c8ebd9d08ee6ff50c0b255f8ec63a71c16fda7af82bb887bf", + "sha256:8479067f342acf957dc82ec415d355ab5edb7e7646b90dc6e2fd1d96ad084c97" ], + "index": "pypi", "markers": "python_version < '3.7'", - "version": "==0.6" + "version": "==0.8" }, "distlib": { "hashes": [ @@ -204,9 +213,17 @@ "sha256:53ccfd5c134223e497627b9815d5030edf77d2ed573922f7a0b8f8bb81a1c100", "sha256:75bdec14c397f528724c1bfd9709d660b33a4d2e77387a3358f20b848bb5e5fb" ], - "markers": "python_version >= '3.6'", + "markers": "python_version < '3.8'", "version": "==4.8.2" }, + "importlib-resources": { + "hashes": [ + "sha256:33a95faed5fc19b4bc16b29a6eeae248a3fe69dd55d4d229d2b480e23eeaad45", + "sha256:d756e2f85dd4de2ba89be0b21dba2a3bbec2e871a42a3a16719258a11f87506b" + ], + "markers": "python_version < '3.7'", + "version": "==5.4.0" + }, "incremental": { "hashes": [ "sha256:02f5de5aff48f6b9f665d99d48bfc7ec03b6e3943210de7cfc88856d755d6f57", @@ -505,6 +522,14 @@ ], "version": "==2021.3" }, + "pywin32-ctypes": { + "hashes": [ + "sha256:24ffc3b341d457d48e8922352130cf2644024a4ff09762a2261fd34c36ee5942", + "sha256:9dc2d991b3479cc2df15930958b674a48a227d5361d413827a4cfd0b5876fc98" + ], + "markers": "sys_platform == 'win32'", + "version": "==0.2.0" + }, "readme-renderer": { "hashes": [ "sha256:3286806450d9961d6e3b5f8a59f77e61503799aca5155c8d8d40359b4e1e1adc", @@ -514,8 +539,55 @@ }, "regex": { "hashes": [ + "sha256:05b7d6d7e64efe309972adab77fc2af8907bb93217ec60aa9fe12a0dad35874f", + "sha256:0617383e2fe465732af4509e61648b77cbe3aee68b6ac8c0b6fe934db90be5cc", + "sha256:07856afef5ffcc052e7eccf3213317fbb94e4a5cd8177a2caa69c980657b3cb4", + "sha256:162abfd74e88001d20cb73ceaffbfe601469923e875caf9118333b1a4aaafdc4", + "sha256:2207ae4f64ad3af399e2d30dde66f0b36ae5c3129b52885f1bffc2f05ec505c8", + "sha256:30ab804ea73972049b7a2a5c62d97687d69b5a60a67adca07eb73a0ddbc9e29f", + "sha256:3b5df18db1fccd66de15aa59c41e4f853b5df7550723d26aa6cb7f40e5d9da5a", + "sha256:3c5fb32cc6077abad3bbf0323067636d93307c9fa93e072771cf9a64d1c0f3ef", + "sha256:416c5f1a188c91e3eb41e9c8787288e707f7d2ebe66e0a6563af280d9b68478f", + "sha256:432bd15d40ed835a51617521d60d0125867f7b88acf653e4ed994a1f8e4995dc", + "sha256:4aaa4e0705ef2b73dd8e36eeb4c868f80f8393f5f4d855e94025ce7ad8525f50", "sha256:537ca6a3586931b16a85ac38c08cc48f10fc870a5b25e51794c74df843e9966d", - "sha256:f341ee2df0999bfdf7a95e448075effe0db212a59387de1a70690e4acb03d4c6" + "sha256:53db2c6be8a2710b359bfd3d3aa17ba38f8aa72a82309a12ae99d3c0c3dcd74d", + "sha256:5537f71b6d646f7f5f340562ec4c77b6e1c915f8baae822ea0b7e46c1f09b733", + "sha256:6650f16365f1924d6014d2ea770bde8555b4a39dc9576abb95e3cd1ff0263b36", + "sha256:666abff54e474d28ff42756d94544cdfd42e2ee97065857413b72e8a2d6a6345", + "sha256:68a067c11463de2a37157930d8b153005085e42bcb7ad9ca562d77ba7d1404e0", + "sha256:780b48456a0f0ba4d390e8b5f7c661fdd218934388cde1a974010a965e200e12", + "sha256:788aef3549f1924d5c38263104dae7395bf020a42776d5ec5ea2b0d3d85d6646", + "sha256:7ee1227cf08b6716c85504aebc49ac827eb88fcc6e51564f010f11a406c0a667", + "sha256:7f301b11b9d214f83ddaf689181051e7f48905568b0c7017c04c06dfd065e244", + "sha256:83ee89483672b11f8952b158640d0c0ff02dc43d9cb1b70c1564b49abe92ce29", + "sha256:85bfa6a5413be0ee6c5c4a663668a2cad2cbecdee367630d097d7823041bdeec", + "sha256:9345b6f7ee578bad8e475129ed40123d265464c4cfead6c261fd60fc9de00bcf", + "sha256:93a5051fcf5fad72de73b96f07d30bc29665697fb8ecdfbc474f3452c78adcf4", + "sha256:962b9a917dd7ceacbe5cd424556914cb0d636001e393b43dc886ba31d2a1e449", + "sha256:98ba568e8ae26beb726aeea2273053c717641933836568c2a0278a84987b2a1a", + "sha256:a3feefd5e95871872673b08636f96b61ebef62971eab044f5124fb4dea39919d", + "sha256:b43c2b8a330a490daaef5a47ab114935002b13b3f9dc5da56d5322ff218eeadb", + "sha256:b483c9d00a565633c87abd0aaf27eb5016de23fed952e054ecc19ce32f6a9e7e", + "sha256:ba05430e819e58544e840a68b03b28b6d328aff2e41579037e8bab7653b37d83", + "sha256:ca5f18a75e1256ce07494e245cdb146f5a9267d3c702ebf9b65c7f8bd843431e", + "sha256:d5ca078bb666c4a9d1287a379fe617a6dccd18c3e8a7e6c7e1eb8974330c626a", + "sha256:da1a90c1ddb7531b1d5ff1e171b4ee61f6345119be7351104b67ff413843fe94", + "sha256:dba70f30fd81f8ce6d32ddeef37d91c8948e5d5a4c63242d16a2b2df8143aafc", + "sha256:dd33eb9bdcfbabab3459c9ee651d94c842bc8a05fabc95edf4ee0c15a072495e", + "sha256:e0538c43565ee6e703d3a7c3bdfe4037a5209250e8502c98f20fea6f5fdf2965", + "sha256:e1f54b9b4b6c53369f40028d2dd07a8c374583417ee6ec0ea304e710a20f80a0", + "sha256:e32d2a2b02ccbef10145df9135751abea1f9f076e67a4e261b05f24b94219e36", + "sha256:e71255ba42567d34a13c03968736c5d39bb4a97ce98188fafb27ce981115beec", + "sha256:ed2e07c6a26ed4bea91b897ee2b0835c21716d9a469a96c3e878dc5f8c55bb23", + "sha256:eef2afb0fd1747f33f1ee3e209bce1ed582d1896b240ccc5e2697e3275f037c7", + "sha256:f23222527b307970e383433daec128d769ff778d9b29343fb3496472dc20dabe", + "sha256:f341ee2df0999bfdf7a95e448075effe0db212a59387de1a70690e4acb03d4c6", + "sha256:f7f325be2804246a75a4f45c72d4ce80d2443ab815063cdf70ee8fb2ca59ee1b", + "sha256:f8af619e3be812a2059b212064ea7a640aff0568d972cd1b9e920837469eb3cb", + "sha256:fa8c626d6441e2d04b6ee703ef2d1e17608ad44c7cb75258c09dd42bacdfc64b", + "sha256:fbb9dc00e39f3e6c0ef48edee202f9520dafb233e8b51b06b8428cfcb92abd30", + "sha256:fff55f3ce50a3ff63ec8e2a8d3dd924f1941b250b0aac3d3d42b687eeff07a8e" ], "version": "==2021.11.10" }, @@ -690,6 +762,31 @@ "markers": "python_version >= '3.6'", "version": "==3.6.0" }, + "typed-ast": { + "hashes": [ + "sha256:14fed8820114a389a2b7e91624db5f85f3f6682fda09fe0268a59aabd28fe5f5", + "sha256:155b74b078be842d2eb630dd30a280025eca0a5383c7d45853c27afee65f278f", + "sha256:224afecb8b39739f5c9562794a7c98325cb9d972712e1a98b6989a4720219541", + "sha256:361b9e5d27bd8e3ccb6ea6ad6c4f3c0be322a1a0f8177db6d56264fa0ae40410", + "sha256:37ba2ab65a0028b1a4f2b61a8fe77f12d242731977d274a03d68ebb751271508", + "sha256:49af5b8f6f03ed1eb89ee06c1d7c2e7c8e743d720c3746a5857609a1abc94c94", + "sha256:51040bf45aacefa44fa67fb9ebcd1f2bec73182b99a532c2394eea7dabd18e24", + "sha256:52ca2b2b524d770bed7a393371a38e91943f9160a190141e0df911586066ecda", + "sha256:618912cbc7e17b4aeba86ffe071698c6e2d292acbd6d1d5ec1ee724b8c4ae450", + "sha256:65c81abbabda7d760df7304d843cc9dbe7ef5d485504ca59a46ae2d1731d2428", + "sha256:7b310a207ee9fde3f46ba327989e6cba4195bc0c8c70a158456e7b10233e6bed", + "sha256:7e6731044f748340ef68dcadb5172a4b1f40847a2983fe3983b2a66445fbc8e6", + "sha256:806e0c7346b9b4af8c62d9a29053f484599921a4448c37fbbcbbf15c25138570", + "sha256:a67fd5914603e2165e075f1b12f5a8356bfb9557e8bfb74511108cfbab0f51ed", + "sha256:e4374a76e61399a173137e7984a1d7e356038cf844f24fd8aea46c8029a2f712", + "sha256:e8a9b9c87801cecaad3b4c2b8876387115d1a14caa602c1618cedbb0cb2a14e6", + "sha256:ea517c2bb11c5e4ba7a83a91482a2837041181d57d3ed0749a6c382a2b6b7086", + "sha256:ec184dfb5d3d11e82841dbb973e7092b75f306b625fad7b2e665b64c5d60ab3f", + "sha256:ff4ad88271aa7a55f19b6a161ed44e088c393846d954729549e3cde8257747bb" + ], + "markers": "python_version < '3.8' and implementation_name == 'cpython'", + "version": "==1.5.0" + }, "typing-extensions": { "hashes": [ "sha256:2cdf80e4e04866a9b3689a51869016d36db0814d84b8d8a568d22781d45d27ed",