Merge pull request #4870 from pypa/vendor/charset-normalizer

[vendor] Update charset-normalizer
This commit is contained in:
Frost Ming
2021-11-17 16:01:57 +08:00
committed by GitHub
16 changed files with 2935 additions and 991 deletions
+3
View File
@@ -41,6 +41,9 @@ jobs:
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
os: [MacOS, Ubuntu, Windows]
include:
- python-version: 3.6
os: Ubuntu
steps:
- uses: actions/checkout@v1
+1 -3
View File
@@ -4,9 +4,7 @@ sphinx-click = "<3"
click = "*"
pytest_pypi = {path = "./tests/pytest-pypi", editable = true}
stdeb = {version="*", markers="sys_platform == 'linux'"}
jedi = "*"
isort = "*"
rope = "*"
dataclasses = {version="*", markers="python_version < '3.7'"}
sphinxcontrib-spelling = "<4.3.0"
[packages]
Generated
+185 -141
View File
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "2caa0b5a50a8b6911a1cb6d4c7cc8040686345e460c52a32ae7cb0f4ed34385d"
"sha256": "b6632ccfba082244f188747d88665264be87621552d2c1bbebaf36174bc24e8a"
},
"pipfile-spec": 6,
"requires": {},
@@ -22,13 +22,6 @@
],
"version": "==0.7.12"
},
"appdirs": {
"hashes": [
"sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41",
"sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"
],
"version": "==1.4.4"
},
"arpeggio": {
"hashes": [
"sha256:bfe349f252f82f82d84cb886f1d5081d1a31451e6045275e9f90b65d0daa06f1",
@@ -41,7 +34,7 @@
"sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197",
"sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"markers": "sys_platform == 'win32'",
"version": "==1.4.0"
},
"attrs": {
@@ -62,11 +55,11 @@
},
"backports.entry-points-selectable": {
"hashes": [
"sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a",
"sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"
"sha256:7fceed9532a7aa2bd888654a7314f864a3c16a4e710b34a58cfc0f08114c663b",
"sha256:914b21a479fde881635f7af5adc7f6e38d6b274be32269070c53b698c60d5386"
],
"markers": "python_version >= '2.7'",
"version": "==1.1.0"
"version": "==1.1.1"
},
"beautifulsoup4": {
"hashes": [
@@ -78,11 +71,11 @@
},
"black": {
"hashes": [
"sha256:6eb7448da9143ee65b856a5f3676b7dda98ad9abe0f87fce8c59291f15e82a5b",
"sha256:a9952229092e325fe5f3dae56d81f639b23f7131eb840781947e4b2886030f33"
"sha256:0b1f66cbfadcd332ceeaeecf6373d9991d451868d2e2219ad0ac1213fb701117",
"sha256:83f3852301c8dcb229e9c444dd79f573c8d31c7c2dad9bbaaa94c808630e32aa"
],
"markers": "python_full_version >= '3.6.2'",
"version": "==21.10b0"
"version": "==21.11b0"
},
"bleach": {
"hashes": [
@@ -132,9 +125,18 @@
"sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b",
"sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"markers": "sys_platform == 'win32'",
"version": "==0.4.4"
},
"dataclasses": {
"hashes": [
"sha256:0201d89fa866f68c8ebd9d08ee6ff50c0b255f8ec63a71c16fda7af82bb887bf",
"sha256:8479067f342acf957dc82ec415d355ab5edb7e7646b90dc6e2fd1d96ad084c97"
],
"index": "pypi",
"markers": "python_version < '3.7'",
"version": "==0.8"
},
"distlib": {
"hashes": [
"sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31",
@@ -144,11 +146,11 @@
},
"docutils": {
"hashes": [
"sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125",
"sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61"
"sha256:0c5b78adfbf7762415433f5515cd5c9e762339e23369dbe8000d84a4bf4ab3af",
"sha256:c2de3a60e9e7d07be26b7f2b00ca0309c207e06c100f9cc2a94931fc75a478fc"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==0.17.1"
"version": "==0.16"
},
"execnet": {
"hashes": [
@@ -160,11 +162,11 @@
},
"filelock": {
"hashes": [
"sha256:7afc856f74fa7006a289fd10fa840e1eebd8bbff6bffb69c26c54a0512ea8cf8",
"sha256:bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b"
"sha256:2e139a228bcf56dd8b2274a65174d005c4a6b68540ee0bdbb92c76f43f29f7e8",
"sha256:93d512b32a23baf4cac44ffd72ccf70732aeff7b8050fcaf6d3ec406d954baf4"
],
"markers": "python_version >= '3.6'",
"version": "==3.3.2"
"version": "==3.4.0"
},
"flake8": {
"hashes": [
@@ -200,19 +202,27 @@
},
"imagesize": {
"hashes": [
"sha256:6965f19a6a2039c7d48bca7dba2473069ff854c36ae6f19d2cde309d998228a1",
"sha256:b1f6b5a4eab1f73479a50fb79fcf729514a900c341d8503d62a62dbc4127a2b1"
"sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c",
"sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.2.0"
"version": "==1.3.0"
},
"importlib-metadata": {
"hashes": [
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
"sha256:53ccfd5c134223e497627b9815d5030edf77d2ed573922f7a0b8f8bb81a1c100",
"sha256:75bdec14c397f528724c1bfd9709d660b33a4d2e77387a3358f20b848bb5e5fb"
],
"markers": "python_version >= '3.6'",
"version": "==4.8.1"
"markers": "python_version < '3.8'",
"version": "==4.8.2"
},
"importlib-resources": {
"hashes": [
"sha256:33a95faed5fc19b4bc16b29a6eeae248a3fe69dd55d4d229d2b480e23eeaad45",
"sha256:d756e2f85dd4de2ba89be0b21dba2a3bbec2e871a42a3a16719258a11f87506b"
],
"markers": "python_version < '3.7'",
"version": "==5.4.0"
},
"incremental": {
"hashes": [
@@ -236,14 +246,6 @@
],
"version": "==1.6.0"
},
"isort": {
"hashes": [
"sha256:1a18ccace2ed8910bd9458b74a3ecbafd7b2f581301b0ab65cfdd4338272d76f",
"sha256:e52ff6d38012b131628cf0f26c51e7bd3a7c81592eefe3ac71411e692f1b9345"
],
"index": "pypi",
"version": "==5.10.0"
},
"itsdangerous": {
"hashes": [
"sha256:5174094b9637652bdb841a3029700391451bd092ba3db90600dea710ba28e97c",
@@ -252,21 +254,13 @@
"markers": "python_version >= '3.6'",
"version": "==2.0.1"
},
"jedi": {
"hashes": [
"sha256:18456d83f65f400ab0c2d3319e48520420ef43b23a086fdc05dff34132f0fb93",
"sha256:92550a404bad8afed881a137ec9a461fed49eca661414be45059329614ed0707"
],
"index": "pypi",
"version": "==0.18.0"
},
"jinja2": {
"hashes": [
"sha256:827a0e32839ab1600d4eb1c4c33ec5a8edfbc5cb42dafa13b81f182f97784b45",
"sha256:8569982d3f0889eed11dd620c706d39b60c36d6d25843961f33f77fb6bc6b20c"
"sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8",
"sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"
],
"markers": "python_version >= '3.6'",
"version": "==3.0.2"
"version": "==3.0.3"
},
"keyring": {
"hashes": [
@@ -381,14 +375,6 @@
"markers": "python_version >= '3.6'",
"version": "==21.2"
},
"parso": {
"hashes": [
"sha256:12b83492c6239ce32ff5eed6d3639d6a536170723c6f3f1506869f1ace413398",
"sha256:a8c4922db71e4fdb90e0d0bc6e50f9b273d3397925e5e60a717e719201778d22"
],
"markers": "python_version >= '3.6'",
"version": "==0.8.2"
},
"parver": {
"hashes": [
"sha256:41a548c51b006a2f2522b54293cbfd2514bffa10774ece8430c9964a20cbd8b4",
@@ -536,6 +522,14 @@
],
"version": "==2021.3"
},
"pywin32-ctypes": {
"hashes": [
"sha256:24ffc3b341d457d48e8922352130cf2644024a4ff09762a2261fd34c36ee5942",
"sha256:9dc2d991b3479cc2df15930958b674a48a227d5361d413827a4cfd0b5876fc98"
],
"markers": "sys_platform == 'win32'",
"version": "==0.2.0"
},
"readme-renderer": {
"hashes": [
"sha256:3286806450d9961d6e3b5f8a59f77e61503799aca5155c8d8d40359b4e1e1adc",
@@ -545,57 +539,57 @@
},
"regex": {
"hashes": [
"sha256:0075fe4e2c2720a685fef0f863edd67740ff78c342cf20b2a79bc19388edf5db",
"sha256:0621c90f28d17260b41838b22c81a79ff436141b322960eb49c7b3f91d1cbab6",
"sha256:070336382ca92c16c45b4066c4ba9fa83fb0bd13d5553a82e07d344df8d58a84",
"sha256:075b0fdbaea81afcac5a39a0d1bb91de887dd0d93bf692a5dd69c430e7fc58cb",
"sha256:07e3755e0f070bc31567dfe447a02011bfa8444239b3e9e5cca6773a22133839",
"sha256:0ed3465acf8c7c10aa2e0f3d9671da410ead63b38a77283ef464cbb64275df58",
"sha256:17e095f7f96a4b9f24b93c2c915f31a5201a6316618d919b0593afb070a5270e",
"sha256:1d85ca137756d62c8138c971453cafe64741adad1f6a7e63a22a5a8abdbd19fa",
"sha256:20605bfad484e1341b2cbfea0708e4b211d233716604846baa54b94821f487cb",
"sha256:23f93e74409c210de4de270d4bf88fb8ab736a7400f74210df63a93728cf70d6",
"sha256:2bb7cae741de1aa03e3dd3a7d98c304871eb155921ca1f0d7cc11f5aade913fd",
"sha256:2e3ff69ab203b54ce5c480c3ccbe959394ea5beef6bd5ad1785457df7acea92e",
"sha256:30fe317332de0e50195665bc61a27d46e903d682f94042c36b3f88cb84bd7958",
"sha256:3576e173e7b4f88f683b4de7db0c2af1b209bb48b2bf1c827a6f3564fad59a97",
"sha256:35ed5714467fc606551db26f80ee5d6aa1f01185586a7bccd96f179c4b974a11",
"sha256:41c66bd6750237a8ed23028a6c9173dc0c92dc24c473e771d3bfb9ee817700c3",
"sha256:48b4f4810117a9072a5aa70f7fea5f86fa9efbe9a798312e0a05044bd707cc33",
"sha256:4abf35e16f4b639daaf05a2602c1b1d47370e01babf9821306aa138924e3fe92",
"sha256:4fba661a4966adbd2c3c08d3caad6822ecb6878f5456588e2475ae23a6e47929",
"sha256:5e85dcfc5d0f374955015ae12c08365b565c6f1eaf36dd182476a4d8e5a1cdb7",
"sha256:77f9d16f7970791f17ecce7e7f101548314ed1ee2583d4268601f30af3170856",
"sha256:7ee36d5113b6506b97f45f2e8447cb9af146e60e3f527d93013d19f6d0405f3b",
"sha256:7fab29411d75c2eb48070020a40f80255936d7c31357b086e5931c107d48306e",
"sha256:85289c25f658e3260b00178757c87f033f3d4b3e40aa4abdd4dc875ff11a94fb",
"sha256:886f459db10c0f9d17c87d6594e77be915f18d343ee138e68d259eb385f044a8",
"sha256:897c539f0f3b2c3a715be651322bef2167de1cdc276b3f370ae81a3bda62df71",
"sha256:8fbe1768feafd3d0156556677b8ff234c7bf94a8110e906b2d73506f577a3269",
"sha256:9267e4fba27e6dd1008c4f2983cc548c98b4be4444e3e342db11296c0f45512f",
"sha256:9486ebda015913909bc28763c6b92fcc3b5e5a67dee4674bceed112109f5dfb8",
"sha256:956187ff49db7014ceb31e88fcacf4cf63371e6e44d209cf8816cd4a2d61e11a",
"sha256:a56735c35a3704603d9d7b243ee06139f0837bcac2171d9ba1d638ce1df0742a",
"sha256:ab1fea8832976ad0bebb11f652b692c328043057d35e9ebc78ab0a7a30cf9a70",
"sha256:adf35d88d9cffc202e6046e4c32e1e11a1d0238b2fcf095c94f109e510ececea",
"sha256:af23b9ca9a874ef0ec20e44467b8edd556c37b0f46f93abfa93752ea7c0e8d1e",
"sha256:b3794cea825f101fe0df9af8a00f9fad8e119c91e39a28636b95ee2b45b6c2e5",
"sha256:bb11c982a849dc22782210b01d0c1b98eb3696ce655d58a54180774e4880ac66",
"sha256:be30cd315db0168063a1755fa20a31119da91afa51da2907553493516e165640",
"sha256:c6238d30dcff141de076344cf7f52468de61729c2f70d776fce12f55fe8df790",
"sha256:cb1e44d860345ab5d4f533b6c37565a22f403277f44c4d2d5e06c325da959883",
"sha256:d4bfe3bc3976ccaeb4ae32f51e631964e2f0e85b2b752721b7a02de5ce3b7f27",
"sha256:d8ee91e1c295beb5c132ebd78616814de26fedba6aa8687ea460c7f5eb289b72",
"sha256:e3c00cb5c71da655e1e5161481455479b613d500dd1bd252aa01df4f037c641f",
"sha256:e9cec3a62d146e8e122d159ab93ac32c988e2ec0dcb1e18e9e53ff2da4fbd30c",
"sha256:ef4e53e2fdc997d91f5b682f81f7dc9661db9a437acce28745d765d251902d85",
"sha256:f0148988af0182a0a4e5020e7c168014f2c55a16d11179610f7883dd48ac0ebe",
"sha256:f20f9f430c33597887ba9bd76635476928e76cad2981643ca8be277b8e97aa96",
"sha256:f5930d334c2f607711d54761956aedf8137f83f1b764b9640be21d25a976f3a4",
"sha256:f6a28e87ba69f3a4f30d775b179aac55be1ce59f55799328a0d9b6df8f16b39d",
"sha256:f9ee98d658a146cb6507be720a0ce1b44f2abef8fb43c2859791d91aace17cd5"
"sha256:05b7d6d7e64efe309972adab77fc2af8907bb93217ec60aa9fe12a0dad35874f",
"sha256:0617383e2fe465732af4509e61648b77cbe3aee68b6ac8c0b6fe934db90be5cc",
"sha256:07856afef5ffcc052e7eccf3213317fbb94e4a5cd8177a2caa69c980657b3cb4",
"sha256:162abfd74e88001d20cb73ceaffbfe601469923e875caf9118333b1a4aaafdc4",
"sha256:2207ae4f64ad3af399e2d30dde66f0b36ae5c3129b52885f1bffc2f05ec505c8",
"sha256:30ab804ea73972049b7a2a5c62d97687d69b5a60a67adca07eb73a0ddbc9e29f",
"sha256:3b5df18db1fccd66de15aa59c41e4f853b5df7550723d26aa6cb7f40e5d9da5a",
"sha256:3c5fb32cc6077abad3bbf0323067636d93307c9fa93e072771cf9a64d1c0f3ef",
"sha256:416c5f1a188c91e3eb41e9c8787288e707f7d2ebe66e0a6563af280d9b68478f",
"sha256:432bd15d40ed835a51617521d60d0125867f7b88acf653e4ed994a1f8e4995dc",
"sha256:4aaa4e0705ef2b73dd8e36eeb4c868f80f8393f5f4d855e94025ce7ad8525f50",
"sha256:537ca6a3586931b16a85ac38c08cc48f10fc870a5b25e51794c74df843e9966d",
"sha256:53db2c6be8a2710b359bfd3d3aa17ba38f8aa72a82309a12ae99d3c0c3dcd74d",
"sha256:5537f71b6d646f7f5f340562ec4c77b6e1c915f8baae822ea0b7e46c1f09b733",
"sha256:6650f16365f1924d6014d2ea770bde8555b4a39dc9576abb95e3cd1ff0263b36",
"sha256:666abff54e474d28ff42756d94544cdfd42e2ee97065857413b72e8a2d6a6345",
"sha256:68a067c11463de2a37157930d8b153005085e42bcb7ad9ca562d77ba7d1404e0",
"sha256:780b48456a0f0ba4d390e8b5f7c661fdd218934388cde1a974010a965e200e12",
"sha256:788aef3549f1924d5c38263104dae7395bf020a42776d5ec5ea2b0d3d85d6646",
"sha256:7ee1227cf08b6716c85504aebc49ac827eb88fcc6e51564f010f11a406c0a667",
"sha256:7f301b11b9d214f83ddaf689181051e7f48905568b0c7017c04c06dfd065e244",
"sha256:83ee89483672b11f8952b158640d0c0ff02dc43d9cb1b70c1564b49abe92ce29",
"sha256:85bfa6a5413be0ee6c5c4a663668a2cad2cbecdee367630d097d7823041bdeec",
"sha256:9345b6f7ee578bad8e475129ed40123d265464c4cfead6c261fd60fc9de00bcf",
"sha256:93a5051fcf5fad72de73b96f07d30bc29665697fb8ecdfbc474f3452c78adcf4",
"sha256:962b9a917dd7ceacbe5cd424556914cb0d636001e393b43dc886ba31d2a1e449",
"sha256:98ba568e8ae26beb726aeea2273053c717641933836568c2a0278a84987b2a1a",
"sha256:a3feefd5e95871872673b08636f96b61ebef62971eab044f5124fb4dea39919d",
"sha256:b43c2b8a330a490daaef5a47ab114935002b13b3f9dc5da56d5322ff218eeadb",
"sha256:b483c9d00a565633c87abd0aaf27eb5016de23fed952e054ecc19ce32f6a9e7e",
"sha256:ba05430e819e58544e840a68b03b28b6d328aff2e41579037e8bab7653b37d83",
"sha256:ca5f18a75e1256ce07494e245cdb146f5a9267d3c702ebf9b65c7f8bd843431e",
"sha256:d5ca078bb666c4a9d1287a379fe617a6dccd18c3e8a7e6c7e1eb8974330c626a",
"sha256:da1a90c1ddb7531b1d5ff1e171b4ee61f6345119be7351104b67ff413843fe94",
"sha256:dba70f30fd81f8ce6d32ddeef37d91c8948e5d5a4c63242d16a2b2df8143aafc",
"sha256:dd33eb9bdcfbabab3459c9ee651d94c842bc8a05fabc95edf4ee0c15a072495e",
"sha256:e0538c43565ee6e703d3a7c3bdfe4037a5209250e8502c98f20fea6f5fdf2965",
"sha256:e1f54b9b4b6c53369f40028d2dd07a8c374583417ee6ec0ea304e710a20f80a0",
"sha256:e32d2a2b02ccbef10145df9135751abea1f9f076e67a4e261b05f24b94219e36",
"sha256:e71255ba42567d34a13c03968736c5d39bb4a97ce98188fafb27ce981115beec",
"sha256:ed2e07c6a26ed4bea91b897ee2b0835c21716d9a469a96c3e878dc5f8c55bb23",
"sha256:eef2afb0fd1747f33f1ee3e209bce1ed582d1896b240ccc5e2697e3275f037c7",
"sha256:f23222527b307970e383433daec128d769ff778d9b29343fb3496472dc20dabe",
"sha256:f341ee2df0999bfdf7a95e448075effe0db212a59387de1a70690e4acb03d4c6",
"sha256:f7f325be2804246a75a4f45c72d4ce80d2443ab815063cdf70ee8fb2ca59ee1b",
"sha256:f8af619e3be812a2059b212064ea7a640aff0568d972cd1b9e920837469eb3cb",
"sha256:fa8c626d6441e2d04b6ee703ef2d1e17608ad44c7cb75258c09dd42bacdfc64b",
"sha256:fbb9dc00e39f3e6c0ef48edee202f9520dafb233e8b51b06b8428cfcb92abd30",
"sha256:fff55f3ce50a3ff63ec8e2a8d3dd924f1941b250b0aac3d3d42b687eeff07a8e"
],
"version": "==2021.11.2"
"version": "==2021.11.10"
},
"requests": {
"hashes": [
@@ -619,20 +613,13 @@
],
"version": "==1.5.0"
},
"rope": {
"hashes": [
"sha256:366789e069a267296889b2ee7631f9278173b5e7d468f2ea08abe26069a52aef"
],
"index": "pypi",
"version": "==0.21.0"
},
"setuptools": {
"hashes": [
"sha256:a481fbc56b33f5d8f6b33dce41482e64c68b668be44ff42922903b03872590bf",
"sha256:dae6b934a965c8a59d6d230d3867ec408bb95e73bd538ff77e71fedf1eaca729"
"sha256:94ee891f4759150cded601a6beb6b08400413aefd0267b692f3f8c6e0bb238e7",
"sha256:fb537610c2dfe77b5896e3ee53dd53fbdd9adc48076c8f28cee3a30fb59a5038"
],
"markers": "python_version >= '3.6'",
"version": "==58.5.3"
"version": "==59.1.1"
},
"six": {
"hashes": [
@@ -644,26 +631,26 @@
},
"snowballstemmer": {
"hashes": [
"sha256:b51b447bea85f9968c13b650126a888aabd4cb4463fca868ec596826325dedc2",
"sha256:e997baa4f2e9139951b6f4c631bad912dfd3c792467e2f03d7239464af90e914"
"sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1",
"sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"
],
"version": "==2.1.0"
"version": "==2.2.0"
},
"soupsieve": {
"hashes": [
"sha256:617ffc4d0dfd39c66f4d1413a6e165663a34eca86be9b54f97b91756300ff6df",
"sha256:e4860f889dfa88774c07da0b276b70c073b6470fa1a4a8350800bb7bce3dcc76"
"sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb",
"sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9"
],
"markers": "python_version >= '3.6'",
"version": "==2.3"
"version": "==2.3.1"
},
"sphinx": {
"hashes": [
"sha256:9f3e17c64b34afc653d7c5ec95766e03043cc6d80b0de224f59b6b6e19d37c3c",
"sha256:c7658aab75c920288a8cf6f09f244c6cfdae30d82d803ac1634d9f223a80ca08"
"sha256:19010b7b9fa0dc7756a6e105b2aacd3a80f798af3c25c273be64d7beeb482cb1",
"sha256:2320d4e994a191f4b4be27da514e46b3d6b420f2ff895d064f52415d342461e8"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.8.5"
"markers": "python_version >= '3.5'",
"version": "==3.5.4"
},
"sphinx-click": {
"hashes": [
@@ -673,6 +660,46 @@
"index": "pypi",
"version": "==2.7.1"
},
"sphinxcontrib-applehelp": {
"hashes": [
"sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a",
"sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-devhelp": {
"hashes": [
"sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e",
"sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.2"
},
"sphinxcontrib-htmlhelp": {
"hashes": [
"sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07",
"sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2"
],
"markers": "python_version >= '3.6'",
"version": "==2.0.0"
},
"sphinxcontrib-jsmath": {
"hashes": [
"sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178",
"sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.1"
},
"sphinxcontrib-qthelp": {
"hashes": [
"sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72",
"sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"
],
"markers": "python_version >= '3.5'",
"version": "==1.0.3"
},
"sphinxcontrib-serializinghtml": {
"hashes": [
"sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd",
@@ -689,14 +716,6 @@
"index": "pypi",
"version": "==4.2.1"
},
"sphinxcontrib-websupport": {
"hashes": [
"sha256:4edf0223a0685a7c485ae5a156b6f529ba1ee481a1417817935b20bde1956232",
"sha256:6fc9287dfc823fe9aa432463edd6cea47fa9ebbf488d7f289b322ffcfca075c7"
],
"markers": "python_version >= '3.5'",
"version": "==1.2.4"
},
"stdeb": {
"hashes": [
"sha256:08c22c9c03b28a140fe3ec5064b53a5288279f22e596ca06b0be698d50c93cf2"
@@ -737,19 +756,44 @@
},
"twine": {
"hashes": [
"sha256:218c42324121d4417cbcbbda59c623b8acc4becfce3daa545e6b6dd48bd21385",
"sha256:3725b79a6f1cfe84a134544ae1894706e60719ab28547cb6c6de781b9f72706d"
"sha256:4caad5ef4722e127b3749052fcbffaaf71719b19d4fd4973b29c469957adeba2",
"sha256:916070f8ecbd1985ebed5dbb02b9bda9a092882a96d7069d542d4fc0bb5c673c"
],
"markers": "python_version >= '3.6'",
"version": "==3.5.0"
"version": "==3.6.0"
},
"typed-ast": {
"hashes": [
"sha256:14fed8820114a389a2b7e91624db5f85f3f6682fda09fe0268a59aabd28fe5f5",
"sha256:155b74b078be842d2eb630dd30a280025eca0a5383c7d45853c27afee65f278f",
"sha256:224afecb8b39739f5c9562794a7c98325cb9d972712e1a98b6989a4720219541",
"sha256:361b9e5d27bd8e3ccb6ea6ad6c4f3c0be322a1a0f8177db6d56264fa0ae40410",
"sha256:37ba2ab65a0028b1a4f2b61a8fe77f12d242731977d274a03d68ebb751271508",
"sha256:49af5b8f6f03ed1eb89ee06c1d7c2e7c8e743d720c3746a5857609a1abc94c94",
"sha256:51040bf45aacefa44fa67fb9ebcd1f2bec73182b99a532c2394eea7dabd18e24",
"sha256:52ca2b2b524d770bed7a393371a38e91943f9160a190141e0df911586066ecda",
"sha256:618912cbc7e17b4aeba86ffe071698c6e2d292acbd6d1d5ec1ee724b8c4ae450",
"sha256:65c81abbabda7d760df7304d843cc9dbe7ef5d485504ca59a46ae2d1731d2428",
"sha256:7b310a207ee9fde3f46ba327989e6cba4195bc0c8c70a158456e7b10233e6bed",
"sha256:7e6731044f748340ef68dcadb5172a4b1f40847a2983fe3983b2a66445fbc8e6",
"sha256:806e0c7346b9b4af8c62d9a29053f484599921a4448c37fbbcbbf15c25138570",
"sha256:a67fd5914603e2165e075f1b12f5a8356bfb9557e8bfb74511108cfbab0f51ed",
"sha256:e4374a76e61399a173137e7984a1d7e356038cf844f24fd8aea46c8029a2f712",
"sha256:e8a9b9c87801cecaad3b4c2b8876387115d1a14caa602c1618cedbb0cb2a14e6",
"sha256:ea517c2bb11c5e4ba7a83a91482a2837041181d57d3ed0749a6c382a2b6b7086",
"sha256:ec184dfb5d3d11e82841dbb973e7092b75f306b625fad7b2e665b64c5d60ab3f",
"sha256:ff4ad88271aa7a55f19b6a161ed44e088c393846d954729549e3cde8257747bb"
],
"markers": "python_version < '3.8' and implementation_name == 'cpython'",
"version": "==1.5.0"
},
"typing-extensions": {
"hashes": [
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
"sha256:2cdf80e4e04866a9b3689a51869016d36db0814d84b8d8a568d22781d45d27ed",
"sha256:829704698b22e13ec9eaf959122315eabb370b0884400e9818334d8b677023d9"
],
"version": "==3.10.0.2"
"markers": "python_version >= '3.6'",
"version": "==4.0.0"
},
"urllib3": {
"hashes": [
+1
View File
@@ -0,0 +1 @@
Update ``charset-normalizer`` from ``2.0.3`` to ``2.0.7``; this fixes an import error on Python 3.6.
+30 -14
View File
@@ -1,3 +1,4 @@
# -*- coding: utf_8 -*-
"""
Charset-Normalizer
~~~~~~~~~~~~~~
@@ -8,24 +9,39 @@ All IANA character set names for which the Python core library provides codecs a
Basic usage:
>>> from charset_normalizer import from_bytes
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, поне що се отнася до началното и основното образование.'.encode('utf_8'))
>>> "utf_8" in results
True
>>> best_result = results.best()
>>> str(best_result)
'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, поне що се отнася до началното и основното образование.'
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
>>> best_guess = results.best()
>>> str(best_guess)
'Bсеки човек има право на образование. Oбразованието!'
Other methods and usages are available - see the full documentation
at <https://github.com/Ousret/charset_normalizer>.
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
from pipenv.vendor.charset_normalizer.api import from_fp, from_path, from_bytes, normalize
from pipenv.vendor.charset_normalizer.legacy import detect
from pipenv.vendor.charset_normalizer.version import __version__, VERSION
from pipenv.vendor.charset_normalizer.models import CharsetMatch, CharsetMatches
from .api import from_bytes, from_fp, from_path, normalize
from .legacy import (
CharsetDetector,
CharsetDoctor,
CharsetNormalizerMatch,
CharsetNormalizerMatches,
detect,
)
from .models import CharsetMatch, CharsetMatches
from .version import VERSION, __version__
# Backward-compatible v1 imports
from pipenv.vendor.charset_normalizer.models import CharsetNormalizerMatch
import pipenv.vendor.charset_normalizer.api as CharsetDetector
CharsetNormalizerMatches = CharsetDetector
__all__ = (
"from_fp",
"from_path",
"from_bytes",
"normalize",
"detect",
"CharsetMatch",
"CharsetMatches",
"CharsetNormalizerMatch",
"CharsetNormalizerMatches",
"CharsetDetector",
"CharsetDoctor",
"__version__",
"VERSION",
)
+253 -146
View File
@@ -1,38 +1,48 @@
from os.path import splitext, basename
from typing import List, BinaryIO, Optional, Set, Union
from os.path import basename, splitext
from typing import BinaryIO, List, Optional, Set
try:
from os import PathLike
except ImportError:
PathLike = Union[str, 'os.PathLike[str]'] # type: ignore
except ImportError: # pragma: no cover
PathLike = str # type: ignore
from pipenv.vendor.charset_normalizer.constant import TOO_SMALL_SEQUENCE, TOO_BIG_SEQUENCE, IANA_SUPPORTED
from pipenv.vendor.charset_normalizer.md import mess_ratio
from pipenv.vendor.charset_normalizer.models import CharsetMatches, CharsetMatch
from warnings import warn
import logging
from pipenv.vendor.charset_normalizer.utils import any_specified_encoding, is_multi_byte_encoding, identify_sig_or_bom, \
should_strip_sig_or_bom, is_cp_similar, iana_name
from pipenv.vendor.charset_normalizer.cd import coherence_ratio, encoding_languages, mb_encoding_languages, merge_coherence_ratios
from .cd import (
coherence_ratio,
encoding_languages,
mb_encoding_languages,
merge_coherence_ratios,
)
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE
from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
any_specified_encoding,
iana_name,
identify_sig_or_bom,
is_cp_similar,
is_multi_byte_encoding,
should_strip_sig_or_bom,
)
logger = logging.getLogger("charset_normalizer")
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s | %(levelname)s | %(message)s'))
handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
logger.addHandler(handler)
def from_bytes(
sequences: bytes,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
cp_isolation: List[str] = None,
cp_exclusion: List[str] = None,
preemptive_behaviour: bool = True,
explain: bool = False
sequences: bytes,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
cp_isolation: List[str] = None,
cp_exclusion: List[str] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
) -> CharsetMatches:
"""
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
@@ -49,6 +59,13 @@ def from_bytes(
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
"""
if not isinstance(sequences, (bytearray, bytes)):
raise TypeError(
"Expected object of type bytes or bytearray, got: {0}".format(
type(sequences)
)
)
if not explain:
logger.setLevel(logging.CRITICAL)
else:
@@ -57,41 +74,38 @@ def from_bytes(
length = len(sequences) # type: int
if length == 0:
logger.warning("Given content is empty, stopping the process very early, returning empty utf_8 str match")
return CharsetMatches(
[
CharsetMatch(
sequences,
"utf_8",
0.,
False,
[],
""
)
]
logger.warning(
"Given content is empty, stopping the process very early, returning empty utf_8 str match"
)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
if cp_isolation is not None:
logger.warning('cp_isolation is set. use this flag for debugging purpose. '
'limited list of encoding allowed : %s.',
', '.join(cp_isolation))
logger.warning(
"cp_isolation is set. use this flag for debugging purpose. "
"limited list of encoding allowed : %s.",
", ".join(cp_isolation),
)
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
else:
cp_isolation = []
if cp_exclusion is not None:
logger.warning(
'cp_exclusion is set. use this flag for debugging purpose. '
'limited list of encoding excluded : %s.',
', '.join(cp_exclusion))
"cp_exclusion is set. use this flag for debugging purpose. "
"limited list of encoding excluded : %s.",
", ".join(cp_exclusion),
)
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
else:
cp_exclusion = []
if length <= (chunk_size * steps):
logger.warning(
'override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.',
steps, chunk_size, length)
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
steps,
chunk_size,
length,
)
steps = 1
chunk_size = length
@@ -102,15 +116,30 @@ def from_bytes(
is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE # type: bool
if is_too_small_sequence:
warn('Trying to detect encoding from a tiny portion of ({}) byte(s).'.format(length))
logger.warning(
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
length
)
)
elif is_too_large_sequence:
logger.info(
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
length
)
)
prioritized_encodings = [] # type: List[str]
specified_encoding = any_specified_encoding(sequences) if preemptive_behaviour is True else None # type: Optional[str]
specified_encoding = (
any_specified_encoding(sequences) if preemptive_behaviour is True else None
) # type: Optional[str]
if specified_encoding is not None:
prioritized_encodings.append(specified_encoding)
logger.info('Detected declarative mark in sequence. Priority +1 given for %s.', specified_encoding)
logger.info(
"Detected declarative mark in sequence. Priority +1 given for %s.",
specified_encoding,
)
tested = set() # type: Set[str]
tested_but_hard_failure = [] # type: List[str]
@@ -118,9 +147,7 @@ def from_bytes(
fallback_ascii = None # type: Optional[CharsetMatch]
fallback_u8 = None # type: Optional[CharsetMatch]
single_byte_hard_failure_count = 0 # type: int
single_byte_soft_failure_count = 0 # type: int
fallback_specified = None # type: Optional[CharsetMatch]
results = CharsetMatches() # type: CharsetMatches
@@ -128,14 +155,18 @@ def from_bytes(
if sig_encoding is not None:
prioritized_encodings.append(sig_encoding)
logger.info('Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.', len(sig_payload), sig_encoding)
logger.info(
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
len(sig_payload),
sig_encoding,
)
prioritized_encodings.append("ascii")
if "utf_8" not in prioritized_encodings:
prioritized_encodings.append("utf_8")
for encoding_iana in prioritized_encodings+IANA_SUPPORTED:
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
if cp_isolation and encoding_iana not in cp_isolation:
continue
@@ -150,39 +181,48 @@ def from_bytes(
decoded_payload = None # type: Optional[str]
bom_or_sig_available = sig_encoding == encoding_iana # type: bool
strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(encoding_iana) # type: bool
strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
) # type: bool
if encoding_iana in {"utf_16", "utf_32"} and bom_or_sig_available is False:
logger.info("Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.", encoding_iana)
logger.info(
"Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
encoding_iana,
)
continue
try:
is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana) # type: bool
except (ModuleNotFoundError, ImportError):
logger.debug("Encoding %s does not provide an IncrementalDecoder", encoding_iana)
logger.debug(
"Encoding %s does not provide an IncrementalDecoder", encoding_iana
)
continue
try:
if is_too_large_sequence and is_multi_byte_decoder is False:
str(
sequences[:int(50e4)] if strip_sig_or_bom is False else sequences[len(sig_payload):int(50e4)],
encoding=encoding_iana
sequences[: int(50e4)]
if strip_sig_or_bom is False
else sequences[len(sig_payload) : int(50e4)],
encoding=encoding_iana,
)
else:
decoded_payload = str(
sequences if strip_sig_or_bom is False else sequences[len(sig_payload):],
encoding=encoding_iana
sequences
if strip_sig_or_bom is False
else sequences[len(sig_payload) :],
encoding=encoding_iana,
)
except (UnicodeDecodeError, LookupError) as e:
if not isinstance(e, LookupError):
logger.warning(
"Code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
except UnicodeDecodeError as e:
logger.warning('Code page %s does not fit given bytes sequence at ALL. %s', encoding_iana, str(e))
tested_but_hard_failure.append(encoding_iana)
if not is_multi_byte_decoder:
single_byte_hard_failure_count += 1
continue
except LookupError:
tested_but_hard_failure.append(encoding_iana)
if not is_multi_byte_decoder:
single_byte_hard_failure_count += 1
continue
similar_soft_failure_test = False # type: bool
@@ -193,19 +233,31 @@ def from_bytes(
break
if similar_soft_failure_test:
logger.warning("%s is deemed too similar to code page %s and was consider unsuited already. Continuing!", encoding_iana, encoding_soft_failed)
logger.warning(
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
encoding_iana,
encoding_soft_failed,
)
continue
r_ = range(
0 if bom_or_sig_available is False else len(sig_payload),
length,
int(length / steps)
int(length / steps),
)
multi_byte_bonus = is_multi_byte_decoder and decoded_payload is not None and len(decoded_payload) < length # type: bool
multi_byte_bonus = (
is_multi_byte_decoder
and decoded_payload is not None
and len(decoded_payload) < length
) # type: bool
if multi_byte_bonus:
logger.info('Code page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes. Should not be a coincidence. Priority +1 given.', encoding_iana)
logger.info(
"Code page %s is a multi byte encoding table and it appear that at least one character "
"was encoded using n-bytes.",
encoding_iana,
)
max_chunk_gave_up = int(len(r_) / 4) # type: int
@@ -218,62 +270,79 @@ def from_bytes(
md_ratios = []
for i in r_:
cut_sequence = sequences[i:i + chunk_size]
cut_sequence = sequences[i : i + chunk_size]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload+cut_sequence
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(encoding_iana, errors="ignore") # type: str
# multi-byte bad cutting detector and adjustment
# not the cleanest way to perform that fix but clever enough for now.
if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
chunk_partial_size_chk = (
16 if chunk_size > 16 else chunk_size
) # type: int
if (
decoded_payload
and chunk[:chunk_partial_size_chk] not in decoded_payload
):
for j in range(i, i - 4, -1):
cut_sequence = sequences[j : i + chunk_size]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
if chunk[:chunk_partial_size_chk] in decoded_payload:
break
md_chunks.append(chunk)
md_ratios.append(
mess_ratio(
chunk,
threshold
)
)
md_ratios.append(mess_ratio(chunk, threshold))
if md_ratios[-1] >= threshold:
early_stop_count += 1
if (early_stop_count >= max_chunk_gave_up) or (bom_or_sig_available and strip_sig_or_bom is False):
if (early_stop_count >= max_chunk_gave_up) or (
bom_or_sig_available and strip_sig_or_bom is False
):
break
if md_ratios:
mean_mess_ratio = sum(md_ratios) / len(md_ratios) # type: float
else:
mean_mess_ratio = 0.
mean_mess_ratio = 0.0
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
tested_but_soft_failure.append(encoding_iana)
if not is_multi_byte_decoder:
single_byte_soft_failure_count += 1
logger.warning('%s was excluded because of initial chaos probing. Gave up %i time(s). '
'Computed mean chaos is %f %%.',
encoding_iana,
early_stop_count,
round(mean_mess_ratio * 100, ndigits=3))
logger.warning(
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
"Computed mean chaos is %f %%.",
encoding_iana,
early_stop_count,
round(mean_mess_ratio * 100, ndigits=3),
)
# Preparing those fallbacks in case we got nothing.
if encoding_iana in ["ascii", "utf_8"]:
if encoding_iana in ["ascii", "utf_8", specified_encoding]:
fallback_entry = CharsetMatch(
sequences,
encoding_iana,
threshold,
False,
[],
decoded_payload
sequences, encoding_iana, threshold, False, [], decoded_payload
)
if encoding_iana == "ascii":
if encoding_iana == specified_encoding:
fallback_specified = fallback_entry
elif encoding_iana == "ascii":
fallback_ascii = fallback_entry
else:
fallback_u8 = fallback_entry
continue
logger.info(
'%s passed initial chaos probing. Mean measured chaos is %f %%',
"%s passed initial chaos probing. Mean measured chaos is %f %%",
encoding_iana,
round(mean_mess_ratio * 100, ndigits=3)
round(mean_mess_ratio * 100, ndigits=3),
)
if not is_multi_byte_decoder:
@@ -282,21 +351,29 @@ def from_bytes(
target_languages = mb_encoding_languages(encoding_iana)
if target_languages:
logger.info("{} should target any language(s) of {}".format(encoding_iana, str(target_languages)))
logger.info(
"{} should target any language(s) of {}".format(
encoding_iana, str(target_languages)
)
)
cd_ratios = []
for chunk in md_chunks:
chunk_languages = coherence_ratio(chunk, 0.1, ",".join(target_languages) if target_languages else None)
cd_ratios.append(
chunk_languages
chunk_languages = coherence_ratio(
chunk, 0.1, ",".join(target_languages) if target_languages else None
)
cd_ratios.append(chunk_languages)
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
if cd_ratios_merged:
logger.info("We detected language {} using {}".format(cd_ratios_merged, encoding_iana))
logger.info(
"We detected language {} using {}".format(
cd_ratios_merged, encoding_iana
)
)
results.append(
CharsetMatch(
@@ -305,37 +382,46 @@ def from_bytes(
mean_mess_ratio,
bom_or_sig_available,
cd_ratios_merged,
decoded_payload
decoded_payload,
)
)
if encoding_iana in [specified_encoding, "ascii", "utf_8"] and mean_mess_ratio < 0.1:
logger.info("%s is most likely the one. Stopping the process.", encoding_iana)
return CharsetMatches(
[results[encoding_iana]]
if (
encoding_iana in [specified_encoding, "ascii", "utf_8"]
and mean_mess_ratio < 0.1
):
logger.info(
"%s is most likely the one. Stopping the process.", encoding_iana
)
return CharsetMatches([results[encoding_iana]])
if encoding_iana == sig_encoding:
logger.info(
"%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.",
encoding_iana
)
return CharsetMatches(
[results[encoding_iana]]
)
if results[-1].languages:
logger.info(
"Using %s code page we detected the following languages: %s",
encoding_iana,
results[-1]._languages
)
return CharsetMatches([results[encoding_iana]])
if len(results) == 0:
if fallback_u8 or fallback_ascii:
logger.warning("Nothing got out of the detection process. Using ASCII/UTF-8 fallback.")
if fallback_u8 or fallback_ascii or fallback_specified:
logger.warning(
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback."
)
if (fallback_u8 and fallback_ascii is None) or (fallback_u8 and fallback_u8.fingerprint != fallback_ascii.fingerprint):
if fallback_specified:
logger.warning(
"%s will be used as a fallback match", fallback_specified.encoding
)
results.append(fallback_specified)
elif (
(fallback_u8 and fallback_ascii is None)
or (
fallback_u8
and fallback_ascii
and fallback_u8.fingerprint != fallback_ascii.fingerprint
)
or (fallback_u8 is not None)
):
logger.warning("utf_8 will be used as a fallback match")
results.append(fallback_u8)
elif fallback_ascii:
@@ -346,14 +432,14 @@ def from_bytes(
def from_fp(
fp: BinaryIO,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: List[str] = None,
cp_exclusion: List[str] = None,
preemptive_behaviour: bool = True,
explain: bool = False
fp: BinaryIO,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: List[str] = None,
cp_exclusion: List[str] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
) -> CharsetMatches:
"""
Same thing than the function from_bytes but using a file pointer that is already ready.
@@ -367,29 +453,46 @@ def from_fp(
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain
explain,
)
def from_path(
    path: PathLike,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: List[str] = None,
    cp_exclusion: List[str] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
) -> CharsetMatches:
    """
    Same as from_fp, with one extra step: the file at ``path`` is opened in
    binary mode and the resulting file object is handed to from_fp.

    Every other parameter is forwarded to from_fp unchanged.

    Can raise IOError (raised by ``open`` when the path cannot be read).
    """
    # The context manager guarantees the handle is closed even if detection raises.
    with open(path, "rb") as fp:
        # Forward by keyword so the call stays correct if from_fp ever
        # reorders or gains parameters.
        return from_fp(
            fp,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
        )
def normalize(path: PathLike, steps: int = 5, chunk_size: int = 512, threshold: float = 0.20, cp_isolation: List[str] = None, cp_exclusion: List[str] = None, preemptive_behaviour: bool = True) -> CharsetMatch:
def normalize(
path: PathLike,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: List[str] = None,
cp_exclusion: List[str] = None,
preemptive_behaviour: bool = True,
) -> CharsetMatch:
"""
Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
"""
@@ -400,22 +503,26 @@ def normalize(path: PathLike, steps: int = 5, chunk_size: int = 512, threshold:
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour
preemptive_behaviour,
)
filename = basename(path)
target_extensions = list(splitext(filename))
if len(results) == 0:
raise IOError('Unable to normalize "{}", no encoding charset seems to fit.'.format(filename))
raise IOError(
'Unable to normalize "{}", no encoding charset seems to fit.'.format(
filename
)
)
result = results.best()
target_extensions[0] += '-' + result.encoding # type: ignore
target_extensions[0] += "-" + result.encoding # type: ignore
with open('{}'.format(path.replace(filename, ''.join(target_extensions))), 'wb') as fp:
fp.write(
result.output() # type: ignore
)
with open(
"{}".format(str(path).replace(filename, "".join(target_extensions))), "wb"
) as fp:
fp.write(result.output()) # type: ignore
return result # type: ignore
File diff suppressed because it is too large Load Diff
+138 -58
View File
@@ -1,13 +1,20 @@
from codecs import IncrementalDecoder
from functools import lru_cache
from typing import List, Set, Optional, Tuple, Dict
import importlib
from codecs import IncrementalDecoder
from collections import Counter, OrderedDict
from functools import lru_cache
from typing import Dict, List, Optional, Tuple
from pipenv.vendor.charset_normalizer.models import CoherenceMatches
from pipenv.vendor.charset_normalizer.utils import unicode_range, is_unicode_range_secondary, is_multi_byte_encoding
from pipenv.vendor.charset_normalizer.md import is_suspiciously_successive_range
from pipenv.vendor.charset_normalizer.assets import FREQUENCIES
from collections import Counter
from .assets import FREQUENCIES
from .constant import KO_NAMES, TOO_SMALL_SEQUENCE, ZH_NAMES
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
is_accentuated,
is_latin,
is_multi_byte_encoding,
is_unicode_range_secondary,
unicode_range,
)
def encoding_unicode_range(iana_name: str) -> List[str]:
@@ -17,15 +24,14 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
if is_multi_byte_encoding(iana_name):
raise IOError("Function not supported on multi-byte code page")
decoder = importlib.import_module('encodings.{}'.format(iana_name)).IncrementalDecoder # type: ignore
decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore
p = decoder(errors="ignore") # type: IncrementalDecoder
seen_ranges = set() # type: Set[str]
seen_ranges = {} # type: Dict[str, int]
character_count = 0 # type: int
for i in range(48, 255):
chunk = p.decode(
bytes([i])
) # type: str
for i in range(0x40, 0xFF):
chunk = p.decode(bytes([i])) # type: str
if chunk:
character_range = unicode_range(chunk) # type: Optional[str]
@@ -34,9 +40,18 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
continue
if is_unicode_range_secondary(character_range) is False:
seen_ranges.add(character_range)
if character_range not in seen_ranges:
seen_ranges[character_range] = 0
seen_ranges[character_range] += 1
character_count += 1
return sorted(list(seen_ranges))
return sorted(
[
character_range
for character_range in seen_ranges
if seen_ranges[character_range] / character_count >= 0.15
]
)
def unicode_range_languages(primary_range: str) -> List[str]:
@@ -74,42 +89,78 @@ def encoding_languages(iana_name: str) -> List[str]:
return unicode_range_languages(primary_range)
@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily linked to particular language(s).
    This function does the correspondence.

    :param iana_name: Normalized code page name (e.g. "cp932", "gb18030").
    :return: The language name(s) tied to that code page, or an empty list when
             none of the rules below match.

    Note: the result is memoized by lru_cache, so the very same list object is
    handed back on repeated calls; callers should treat it as read-only.
    """
    # str.startswith accepts a tuple of prefixes: one call instead of an `or` chain.
    if iana_name.startswith(("shift_", "iso2022_jp", "euc_j")) or iana_name == "cp932":
        return ["Japanese"]

    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese", "Classical Chinese"]

    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages compatible with the given characters, best match first.

    :param characters: Characters observed in the text being analysed.
    :param ignore_non_latin: When True, skip every language whose frequency table
                             contains at least one character for which is_latin is False.
    :return: Names of the languages for which at least 20% of the frequency-table
             characters appear in ``characters``, sorted by that ratio in descending order.
    """
    languages = []  # type: List[Tuple[str, float]]

    # Build the lookup once: membership is tested for every character of every language.
    available_characters = set(characters)

    source_have_accents = any(
        is_accentuated(character) for character in characters
    )  # type: bool

    for language, language_characters in FREQUENCIES.items():
        # An empty frequency table cannot match anything (and would divide by zero below).
        if not language_characters:
            continue

        target_have_accents = any(
            is_accentuated(language_character)
            for language_character in language_characters
        )  # type: bool
        target_pure_latin = all(
            is_latin(language_character) for language_character in language_characters
        )  # type: bool

        if ignore_non_latin and target_pure_latin is False:
            continue

        # A language without accentuated characters cannot explain accents seen in the source.
        if target_have_accents is False and source_have_accents:
            continue

        character_count = len(language_characters)  # type: int

        character_match_count = sum(
            1
            for language_character in language_characters
            if language_character in available_characters
        )  # type: int

        ratio = character_match_count / character_count  # type: float

        if ratio >= 0.2:
            languages.append((language, ratio))

    # sorted() is stable, so languages with equal ratios keep their FREQUENCIES order.
    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
def characters_popularity_compare(language: str, ordered_characters: List[str]) -> float:
def characters_popularity_compare(
language: str, ordered_characters: List[str]
) -> float:
"""
Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
@@ -124,14 +175,30 @@ def characters_popularity_compare(language: str, ordered_characters: List[str])
if character not in FREQUENCIES[language]:
continue
characters_before_source = FREQUENCIES[language][0:FREQUENCIES[language].index(character)] # type: List[str]
characters_after_source = FREQUENCIES[language][FREQUENCIES[language].index(character):] # type: List[str]
characters_before_source = FREQUENCIES[language][
0 : FREQUENCIES[language].index(character)
] # type: List[str]
characters_after_source = FREQUENCIES[language][
FREQUENCIES[language].index(character) :
] # type: List[str]
characters_before = ordered_characters[0:ordered_characters.index(character)] # type: List[str]
characters_after = ordered_characters[ordered_characters.index(character):] # type: List[str]
characters_before = ordered_characters[
0 : ordered_characters.index(character)
] # type: List[str]
characters_after = ordered_characters[
ordered_characters.index(character) :
] # type: List[str]
before_match_count = [e in characters_before for e in characters_before_source].count(True) # type: int
after_match_count = [e in characters_after for e in characters_after_source].count(True) # type: int
before_match_count = [
e in characters_before for e in characters_before_source
].count(
True
) # type: int
after_match_count = [
e in characters_after for e in characters_after_source
].count(
True
) # type: int
if len(characters_before_source) == 0 and before_match_count <= 4:
character_approved_count += 1
@@ -141,7 +208,10 @@ def characters_popularity_compare(language: str, ordered_characters: List[str])
character_approved_count += 1
continue
if before_match_count / len(characters_before_source) >= 0.4 or after_match_count / len(characters_after_source) >= 0.4:
if (
before_match_count / len(characters_before_source) >= 0.4
or after_match_count / len(characters_after_source) >= 0.4
):
character_approved_count += 1
continue
@@ -154,18 +224,24 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
One containing the latin letters and the other hebrew.
"""
layers = {} # type: Dict[str, str]
layers = OrderedDict() # type: Dict[str, str]
for character in decoded_sequence:
if character.isalpha() is False:
continue
character_range = unicode_range(character) # type: str
character_range = unicode_range(character) # type: Optional[str]
if character_range is None:
continue
layer_target_range = None # type: Optional[str]
for discovered_range in layers:
if is_suspiciously_successive_range(discovered_range, character_range) is False:
if (
is_suspiciously_successive_range(discovered_range, character_range)
is False
):
layer_target_range = discovered_range
break
@@ -186,7 +262,7 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
This function merge results previously given by the function coherence_ratio.
The return type is the same as coherence_ratio.
"""
per_language_ratios = {} # type: Dict[str, List[float]]
per_language_ratios = OrderedDict() # type: Dict[str, List[float]]
merge = [] # type: CoherenceMatches
for result in results:
@@ -195,20 +271,17 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
if language not in per_language_ratios:
per_language_ratios[language] = [ratio]
continue
per_language_ratios[language].append(
ratio
)
per_language_ratios[language].append(ratio)
for language in per_language_ratios:
merge.append(
(
language,
round(
sum(
per_language_ratios[language]
) / len(per_language_ratios[language]),
4
)
sum(per_language_ratios[language])
/ len(per_language_ratios[language]),
4,
),
)
)
@@ -216,21 +289,26 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
@lru_cache(maxsize=2048)
def coherence_ratio(decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None) -> CoherenceMatches:
def coherence_ratio(
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
"""
Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
A layer = Character extraction by alphabets/ranges.
"""
results = [] # type: List[Tuple[str, float]]
lg_inclusion_list = [] # type: List[str]
ignore_non_latin = False # type: bool
sufficient_match_count = 0 # type: int
if lg_inclusion is not None:
lg_inclusion = lg_inclusion.split(",")
lg_inclusion_list = lg_inclusion.split(",")
if lg_inclusion is not None and "Latin Based" in lg_inclusion:
lg_inclusion.remove("Latin Based")
if "Latin Based" in lg_inclusion_list:
ignore_non_latin = True
lg_inclusion_list.remove("Latin Based")
for layer in alpha_unicode_split(decoded_sequence):
sequence_frequencies = Counter(layer) # type: Counter
@@ -238,22 +316,24 @@ def coherence_ratio(decoded_sequence: str, threshold: float = 0.1, lg_inclusion:
character_count = sum([o for c, o in most_common]) # type: int
if character_count <= 32:
if character_count <= TOO_SMALL_SEQUENCE:
continue
popular_character_ordered = [c for c, o in most_common] # type: List[str]
for language in lg_inclusion or alphabet_languages(popular_character_ordered):
ratio = characters_popularity_compare(language, popular_character_ordered) # type: float
for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
):
ratio = characters_popularity_compare(
language, popular_character_ordered
) # type: float
if ratio < threshold:
continue
elif ratio >= 0.8:
sufficient_match_count += 1
results.append(
(language, round(ratio, 4))
)
results.append((language, round(ratio, 4)))
if sufficient_match_count >= 3:
break
+202 -121
View File
@@ -1,16 +1,16 @@
import argparse
import sys
from os.path import abspath
from json import dumps
from pipenv.vendor.charset_normalizer import from_fp
from pipenv.vendor.charset_normalizer.models import CliDetectionResult
from pipenv.vendor.charset_normalizer.version import __version__
from os.path import abspath
from platform import python_version
from typing import List
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__
def query_yes_no(question, default="yes"):
def query_yes_no(question: str, default: str = "yes") -> bool:
"""Ask a yes/no question via input() and return their answer.
"question" is a string that is presented to the user.
@@ -22,8 +22,7 @@ def query_yes_no(question, default="yes"):
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
"""
valid = {"yes": True, "y": True, "ye": True,
"no": False, "n": False}
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
if default is None:
prompt = " [y/n] "
elif default == "yes":
@@ -36,16 +35,15 @@ def query_yes_no(question, default="yes"):
while True:
sys.stdout.write(question + prompt)
choice = input().lower()
if default is not None and choice == '':
if default is not None and choice == "":
return valid[default]
elif choice in valid:
return valid[choice]
else:
sys.stdout.write("Please respond with 'yes' or 'no' "
"(or 'y' or 'n').\n")
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
def cli_detect(argv=None):
def cli_detect(argv: List[str] = None) -> int:
"""
CLI assistant using ARGV and ArgumentParser
:param argv:
@@ -53,133 +51,215 @@ def cli_detect(argv=None):
"""
parser = argparse.ArgumentParser(
description="The Real First Universal Charset Detector. "
"Discover originating encoding used on text file. "
"Normalize text to unicode."
"Discover originating encoding used on text file. "
"Normalize text to unicode."
)
parser.add_argument('files', type=argparse.FileType('rb'), nargs='+', help='File(s) to be analysed')
parser.add_argument('-v', '--verbose', action="store_true", default=False, dest='verbose',
help='Display complementary information about file if any. Stdout will contain logs about the detection process.')
parser.add_argument('-a', '--with-alternative', action="store_true", default=False, dest='alternatives',
help='Output complementary possibilities if any. Top-level JSON WILL be a list.')
parser.add_argument('-n', '--normalize', action="store_true", default=False, dest='normalize',
help='Permit to normalize input file. If not set, program does not write anything.')
parser.add_argument('-m', '--minimal', action="store_true", default=False, dest='minimal',
help='Only output the charset detected to STDOUT. Disabling JSON output.')
parser.add_argument('-r', '--replace', action="store_true", default=False, dest='replace',
help='Replace file when trying to normalize it instead of creating a new one.')
parser.add_argument('-f', '--force', action="store_true", default=False, dest='force',
help='Replace file without asking if you are sure, use this flag with caution.')
parser.add_argument('-t', '--threshold', action="store", default=0.1, type=float, dest='threshold',
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.")
parser.add_argument(
"files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
default=False,
dest="verbose",
help="Display complementary information about file if any. "
"Stdout will contain logs about the detection process.",
)
parser.add_argument(
"-a",
"--with-alternative",
action="store_true",
default=False,
dest="alternatives",
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
)
parser.add_argument(
"-n",
"--normalize",
action="store_true",
default=False,
dest="normalize",
help="Permit to normalize input file. If not set, program does not write anything.",
)
parser.add_argument(
"-m",
"--minimal",
action="store_true",
default=False,
dest="minimal",
help="Only output the charset detected to STDOUT. Disabling JSON output.",
)
parser.add_argument(
"-r",
"--replace",
action="store_true",
default=False,
dest="replace",
help="Replace file when trying to normalize it instead of creating a new one.",
)
parser.add_argument(
"-f",
"--force",
action="store_true",
default=False,
dest="force",
help="Replace file without asking if you are sure, use this flag with caution.",
)
parser.add_argument(
"-t",
"--threshold",
action="store",
default=0.1,
type=float,
dest="threshold",
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
)
parser.add_argument(
"--version",
action="version",
version="Charset-Normalizer {} - Python {}".format(__version__, python_version()),
help="Show version information and exit."
version="Charset-Normalizer {} - Python {}".format(
__version__, python_version()
),
help="Show version information and exit.",
)
args = parser.parse_args(argv)
if args.replace is True and args.normalize is False:
print('Use --replace in addition of --normalize only.', file=sys.stderr)
print("Use --replace in addition of --normalize only.", file=sys.stderr)
return 1
if args.force is True and args.replace is False:
print('Use --force in addition of --replace only.', file=sys.stderr)
print("Use --force in addition of --replace only.", file=sys.stderr)
return 1
if args.threshold < 0. or args.threshold > 1.:
print('--threshold VALUE should be between 0. AND 1.', file=sys.stderr)
if args.threshold < 0.0 or args.threshold > 1.0:
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
return 1
x_ = []
for my_file in args.files:
matches = from_fp(
my_file,
threshold=args.threshold,
explain=args.verbose
)
matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
if len(matches) == 0:
print('Unable to identify originating encoding for "{}". {}'.format(my_file.name, 'Maybe try increasing maximum amount of chaos.' if args.threshold < 1. else ''), file=sys.stderr)
if my_file.closed is False:
my_file.close()
continue
best_guess = matches.best()
x_ = []
r_ = matches.best()
p_ = r_.first()
x_.append(
CliDetectionResult(
abspath(my_file.name),
p_.encoding,
p_.encoding_aliases,
[cp for cp in p_.could_be_from_charset if cp != p_.encoding],
p_.language,
p_.alphabets,
p_.bom,
p_.percent_chaos,
p_.percent_coherence,
None,
True
if best_guess is None:
print(
'Unable to identify originating encoding for "{}". {}'.format(
my_file.name,
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else "",
),
file=sys.stderr,
)
x_.append(
CliDetectionResult(
abspath(my_file.name),
None,
[],
[],
"Unknown",
[],
False,
1.0,
0.0,
None,
True,
)
)
else:
x_.append(
CliDetectionResult(
abspath(my_file.name),
best_guess.encoding,
best_guess.encoding_aliases,
[
cp
for cp in best_guess.could_be_from_charset
if cp != best_guess.encoding
],
best_guess.language,
best_guess.alphabets,
best_guess.bom,
best_guess.percent_chaos,
best_guess.percent_coherence,
None,
True,
)
)
)
if len(matches) > 1 and args.alternatives:
for el in matches:
if el != p_:
x_.append(
CliDetectionResult(
abspath(my_file.name),
el.encoding,
el.encoding_aliases,
[cp for cp in el.could_be_from_charset if cp != el.encoding],
el.language,
el.alphabets,
el.bom,
el.percent_chaos,
el.percent_coherence,
None,
False
if len(matches) > 1 and args.alternatives:
for el in matches:
if el != best_guess:
x_.append(
CliDetectionResult(
abspath(my_file.name),
el.encoding,
el.encoding_aliases,
[
cp
for cp in el.could_be_from_charset
if cp != el.encoding
],
el.language,
el.alphabets,
el.bom,
el.percent_chaos,
el.percent_coherence,
None,
False,
)
)
if args.normalize is True:
if best_guess.encoding.startswith("utf") is True:
print(
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
my_file.name
),
file=sys.stderr,
)
if args.normalize is True:
if p_.encoding.startswith('utf') is True:
print('"{}" file does not need to be normalized, as it already came from unicode.'.format(my_file.name), file=sys.stderr)
if my_file.closed is False:
my_file.close()
continue
o_ = my_file.name.split('.') # type: list[str]
if args.replace is False:
o_.insert(-1, p_.encoding)
if my_file.closed is False:
my_file.close()
else:
if args.force is False and query_yes_no(
'Are you sure to normalize "{}" by replacing it ?'.format(my_file.name), 'no') is False:
if my_file.closed is False:
my_file.close()
continue
try:
x_[0].unicode_path = './{}'.format('.'.join(o_))
o_ = my_file.name.split(".") # type: List[str]
with open(x_[0].unicode_path, 'w', encoding='utf-8') as fp:
fp.write(
str(p_)
)
except IOError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
return 2
if args.replace is False:
o_.insert(-1, best_guess.encoding)
if my_file.closed is False:
my_file.close()
else:
if (
args.force is False
and query_yes_no(
'Are you sure to normalize "{}" by replacing it ?'.format(
my_file.name
),
"no",
)
is False
):
if my_file.closed is False:
my_file.close()
continue
try:
x_[0].unicode_path = abspath("./{}".format(".".join(o_)))
with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
fp.write(str(best_guess))
except IOError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
return 2
if my_file.closed is False:
my_file.close()
@@ -187,24 +267,25 @@ def cli_detect(argv=None):
if args.minimal is False:
print(
dumps(
[
el.__dict__ for el in x_
] if args.alternatives else x_[0].__dict__,
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
ensure_ascii=True,
indent=4
indent=4,
)
)
else:
print(
', '.join(
[
el.encoding for el in x_
]
for my_file in args.files:
print(
", ".join(
[
el.encoding if el.encoding else "undefined"
for el in x_
if el.path == abspath(my_file.name)
]
)
)
)
return 0
if __name__ == '__main__':
if __name__ == "__main__":
cli_detect()
File diff suppressed because one or more lines are too long
+68 -11
View File
@@ -1,7 +1,10 @@
from pipenv.vendor.charset_normalizer.api import from_bytes
from pipenv.vendor.charset_normalizer.constant import CHARDET_CORRESPONDENCE
import warnings
from typing import Dict, Optional, Union
from .api import from_bytes, from_fp, from_path, normalize
from .constant import CHARDET_CORRESPONDENCE
from .models import CharsetMatch, CharsetMatches
def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
"""
@@ -14,8 +17,10 @@ def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
:param byte_str: The byte sequence to examine.
"""
if not isinstance(byte_str, (bytearray, bytes)):
raise TypeError('Expected object of type bytes or bytearray, got: '
'{0}'.format(type(byte_str)))
raise TypeError( # pragma: nocover
"Expected object of type bytes or bytearray, got: "
"{0}".format(type(byte_str))
)
if isinstance(byte_str, bytearray):
byte_str = bytes(byte_str)
@@ -23,16 +28,68 @@ def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
r = from_bytes(byte_str).best()
encoding = r.encoding if r is not None else None
language = r.language if r is not None and r.language != 'Unknown' else ''
confidence = 1. - r.chaos if r is not None else None
language = r.language if r is not None and r.language != "Unknown" else ""
confidence = 1.0 - r.chaos if r is not None else None
# Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig gets stripped in the detection/normalization process
# but chardet does return 'utf-8-sig' and it is a valid codec name.
if r is not None and encoding == 'utf_8' and r.bom:
encoding += '_sig'
if r is not None and encoding == "utf_8" and r.bom:
encoding += "_sig"
return {
'encoding': encoding if encoding not in CHARDET_CORRESPONDENCE else CHARDET_CORRESPONDENCE[encoding],
'language': language,
'confidence': confidence
"encoding": encoding
if encoding not in CHARDET_CORRESPONDENCE
else CHARDET_CORRESPONDENCE[encoding],
"language": language,
"confidence": confidence,
}
class CharsetNormalizerMatch(CharsetMatch):
    """Empty subclass of ``CharsetMatch``; it adds no attributes or behavior.

    NOTE(review): presumably kept so the older class name remains importable
    from this module -- confirm against callers.
    """
class CharsetNormalizerMatches(CharsetMatches):
    """Deprecated facade exposing the module-level detection helpers as static methods.

    Every static method emits a ``DeprecationWarning`` and then forwards all of
    its positional and keyword arguments, unchanged, to the module-level
    function of the same name (``from_fp``, ``from_bytes``, ``from_path``,
    ``normalize``), returning whatever that function returns.
    """

    # Single source of truth for the warning text shared by all four wrappers.
    _DEPRECATION_MESSAGE = (
        "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
        "and scheduled to be removed in 3.0"
    )

    @staticmethod
    def from_fp(*args, **kwargs):  # type: ignore
        """Warn about the deprecation, then delegate to the module-level ``from_fp``."""
        # stacklevel=2 attributes the warning to the caller's line rather than
        # to this wrapper, so default warning filters can surface it to the
        # code that actually needs to migrate.
        warnings.warn(  # pragma: nocover
            CharsetNormalizerMatches._DEPRECATION_MESSAGE,
            DeprecationWarning,
            stacklevel=2,
        )
        return from_fp(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def from_bytes(*args, **kwargs):  # type: ignore
        """Warn about the deprecation, then delegate to the module-level ``from_bytes``."""
        warnings.warn(  # pragma: nocover
            CharsetNormalizerMatches._DEPRECATION_MESSAGE,
            DeprecationWarning,
            stacklevel=2,  # point at the caller, not at this wrapper
        )
        return from_bytes(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def from_path(*args, **kwargs):  # type: ignore
        """Warn about the deprecation, then delegate to the module-level ``from_path``."""
        warnings.warn(  # pragma: nocover
            CharsetNormalizerMatches._DEPRECATION_MESSAGE,
            DeprecationWarning,
            stacklevel=2,  # point at the caller, not at this wrapper
        )
        return from_path(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def normalize(*args, **kwargs):  # type: ignore
        """Warn about the deprecation, then delegate to the module-level ``normalize``."""
        warnings.warn(  # pragma: nocover
            CharsetNormalizerMatches._DEPRECATION_MESSAGE,
            DeprecationWarning,
            stacklevel=2,  # point at the caller, not at this wrapper
        )
        return normalize(*args, **kwargs)  # pragma: nocover
class CharsetDetector(CharsetNormalizerMatches):
    """Empty subclass of ``CharsetNormalizerMatches``; inherits its static methods unchanged."""
class CharsetDoctor(CharsetNormalizerMatches):
    """Empty subclass of ``CharsetNormalizerMatches``; inherits its static methods unchanged."""
+176 -87
View File
@@ -1,9 +1,24 @@
from functools import lru_cache
from typing import Optional, List
from typing import List, Optional
from pipenv.vendor.charset_normalizer.constant import UNICODE_SECONDARY_RANGE_KEYWORD
from pipenv.vendor.charset_normalizer.utils import is_punctuation, is_symbol, unicode_range, is_accentuated, is_latin, \
remove_accent, is_separator, is_cjk
from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
from .utils import (
is_accentuated,
is_ascii,
is_case_variable,
is_cjk,
is_emoticon,
is_hangul,
is_hiragana,
is_katakana,
is_latin,
is_punctuation,
is_separator,
is_symbol,
is_thai,
remove_accent,
unicode_range,
)
class MessDetectorPlugin:
@@ -41,8 +56,7 @@ class MessDetectorPlugin:
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
def __init__(self):
def __init__(self) -> None:
self._punctuation_count = 0 # type: int
self._symbol_count = 0 # type: int
self._character_count = 0 # type: int
@@ -56,10 +70,17 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
def feed(self, character: str) -> None:
self._character_count += 1
if character != self._last_printable_char and character not in ["<", ">", "=", ":", "/", "&", ";", "{", "}", "[", "]"]:
if (
character != self._last_printable_char
and character not in COMMON_SAFE_ASCII_CHARACTERS
):
if is_punctuation(character):
self._punctuation_count += 1
elif character.isdigit() is False and is_symbol(character):
elif (
character.isdigit() is False
and is_symbol(character)
and is_emoticon(character) is False
):
self._symbol_count += 2
self._last_printable_char = character
@@ -72,16 +93,17 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.
return 0.0
ratio_of_punctuation = (self._punctuation_count + self._symbol_count) / self._character_count # type: float
ratio_of_punctuation = (
self._punctuation_count + self._symbol_count
) / self._character_count # type: float
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
class TooManyAccentuatedPlugin(MessDetectorPlugin):
def __init__(self):
def __init__(self) -> None:
self._character_count = 0 # type: int
self._accentuated_count = 0 # type: int
@@ -101,14 +123,15 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.
ratio_of_accentuation = self._accentuated_count / self._character_count # type: float
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.
return 0.0
ratio_of_accentuation = (
self._accentuated_count / self._character_count
) # type: float
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
class UnprintablePlugin(MessDetectorPlugin):
def __init__(self):
def __init__(self) -> None:
self._unprintable_count = 0 # type: int
self._character_count = 0 # type: int
@@ -116,7 +139,11 @@ class UnprintablePlugin(MessDetectorPlugin):
return True
def feed(self, character: str) -> None:
if character not in {'\n', '\t', '\r'} and character.isprintable() is False:
if (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character != "\x1A" # Why? Its the ASCII substitute character.
):
self._unprintable_count += 1
self._character_count += 1
@@ -126,26 +153,31 @@ class UnprintablePlugin(MessDetectorPlugin):
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.
return 0.0
return (self._unprintable_count * 8) / self._character_count
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
def __init__(self):
def __init__(self) -> None:
self._successive_count = 0 # type: int
self._character_count = 0 # type: int
self._last_latin_character = None # type: Optional[str]
def eligible(self, character: str) -> bool:
return is_latin(character)
return character.isalpha() and is_latin(character)
def feed(self, character: str) -> None:
self._character_count += 1
if self._last_latin_character is not None:
if is_accentuated(character) and is_accentuated(self._last_latin_character):
if remove_accent(character) == remove_accent(self._last_latin_character):
if character.isupper() and self._last_latin_character.isupper():
self._successive_count += 1
# Worse if it's the same char duplicated with a different accent.
if remove_accent(character) == remove_accent(
self._last_latin_character
):
self._successive_count += 1
self._last_latin_character = character
@@ -157,14 +189,13 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.
return 0.0
return (self._successive_count * 2) / self._character_count
class SuspiciousRange(MessDetectorPlugin):
def __init__(self):
def __init__(self) -> None:
self._suspicious_successive_range_count = 0 # type: int
self._character_count = 0 # type: int
self._last_printable_seen = None # type: Optional[str]
@@ -175,15 +206,21 @@ class SuspiciousRange(MessDetectorPlugin):
def feed(self, character: str) -> None:
self._character_count += 1
if (
character.isspace()
or is_punctuation(character)
or character in COMMON_SAFE_ASCII_CHARACTERS
):
self._last_printable_seen = None
return
if self._last_printable_seen is None:
self._last_printable_seen = character
return
if character.isspace() or is_punctuation(character):
self._last_printable_seen = None
return
unicode_range_a = unicode_range(self._last_printable_seen) # type: Optional[str]
unicode_range_a = unicode_range(
self._last_printable_seen
) # type: Optional[str]
unicode_range_b = unicode_range(character) # type: Optional[str]
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
@@ -199,22 +236,24 @@ class SuspiciousRange(MessDetectorPlugin):
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.
return 0.0
ratio_of_suspicious_range_usage = (self._suspicious_successive_range_count * 2) / self._character_count # type: float
ratio_of_suspicious_range_usage = (
self._suspicious_successive_range_count * 2
) / self._character_count # type: float
if ratio_of_suspicious_range_usage < 0.1:
return 0.
return 0.0
return ratio_of_suspicious_range_usage
class SuperWeirdWordPlugin(MessDetectorPlugin):
def __init__(self):
def __init__(self) -> None:
self._word_count = 0 # type: int
self._bad_word_count = 0 # type: int
self._is_current_word_bad = False # type: bool
self._foreign_long_watch = False # type: bool
self._character_count = 0 # type: int
self._bad_character_count = 0 # type: int
@@ -230,16 +269,30 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
self._buffer = "".join([self._buffer, character])
if is_accentuated(character):
self._buffer_accent_count += 1
if (
self._foreign_long_watch is False
and is_latin(character) is False
and is_cjk(character) is False
and is_hangul(character) is False
and is_katakana(character) is False
and is_hiragana(character) is False
and is_thai(character) is False
):
self._foreign_long_watch = True
return
if not self._buffer:
return
if (character.isspace() or is_punctuation(character) or is_separator(character)) and self._buffer:
if (
character.isspace() or is_punctuation(character) or is_separator(character)
) and self._buffer:
self._word_count += 1
buffer_length = len(self._buffer) # type: int
self._character_count += buffer_length
if buffer_length >= 4 and self._buffer_accent_count / buffer_length >= 0.3:
if buffer_length >= 4 and self._buffer_accent_count / buffer_length > 0.34:
self._is_current_word_bad = True
if buffer_length >= 24 and self._foreign_long_watch:
self._is_current_word_bad = True
if self._is_current_word_bad:
@@ -247,15 +300,21 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
self._bad_character_count += len(self._buffer)
self._is_current_word_bad = False
self._foreign_long_watch = False
self._buffer = ""
self._buffer_accent_count = 0
elif character not in {"<", ">", "-", "="} and character.isdigit() is False and is_symbol(character):
elif (
character not in {"<", ">", "-", "="}
and character.isdigit() is False
and is_symbol(character)
):
self._is_current_word_bad = True
self._buffer += character
def reset(self) -> None:
self._buffer = ""
self._is_current_word_bad = False
self._foreign_long_watch = False
self._bad_word_count = 0
self._word_count = 0
self._character_count = 0
@@ -263,19 +322,19 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
@property
def ratio(self) -> float:
if self._word_count <= 16:
return 0.
if self._word_count <= 10:
return 0.0
return self._bad_character_count / self._character_count
class CjkInvalidStopPlugin(MessDetectorPlugin):
"""
GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and can be easily detected.
Searching for the overuse of '' and ''.
GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
can be easily detected. Searching for the overuse of '' and ''.
"""
def __init__(self):
def __init__(self) -> None:
self._wrong_stop_count = 0 # type: int
self._cjk_character_count = 0 # type: int
@@ -296,13 +355,12 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):
@property
def ratio(self) -> float:
if self._cjk_character_count < 16:
return 0.
return 0.0
return self._wrong_stop_count / self._cjk_character_count
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
def __init__(self):
def __init__(self) -> None:
self._buf = False # type: bool
self._character_count_since_last_sep = 0 # type: int
@@ -313,27 +371,51 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
self._character_count = 0 # type: int
self._last_alpha_seen = None # type: Optional[str]
self._current_ascii_only = True # type: bool
def eligible(self, character: str) -> bool:
return character.isspace() or character.isalpha()
return True
def feed(self, character: str) -> None:
if is_separator(character):
if self._character_count_since_last_sep < 24:
self._successive_upper_lower_count_final += self._successive_upper_lower_count
is_concerned = character.isalpha() and is_case_variable(character)
chunk_sep = is_concerned is False
if chunk_sep and self._character_count_since_last_sep > 0:
if (
self._character_count_since_last_sep <= 64
and character.isdigit() is False
and self._current_ascii_only is False
):
self._successive_upper_lower_count_final += (
self._successive_upper_lower_count
)
self._successive_upper_lower_count = 0
self._character_count_since_last_sep = 0
self._last_alpha_seen = None
self._buf = False
self._character_count += 1
self._current_ascii_only = True
return
if self._current_ascii_only is True and is_ascii(character) is False:
self._current_ascii_only = False
if self._last_alpha_seen is not None:
if (character.isupper() and self._last_alpha_seen.islower()) or (character.islower() and self._last_alpha_seen.isupper()):
if (character.isupper() and self._last_alpha_seen.islower()) or (
character.islower() and self._last_alpha_seen.isupper()
):
if self._buf is True:
self._successive_upper_lower_count += 1
self._successive_upper_lower_count += 2
self._buf = False
else:
self._buf = True
else:
self._buf = False
self._character_count += 1
self._character_count_since_last_sep += 1
self._last_alpha_seen = character
def reset(self) -> None:
@@ -342,16 +424,20 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
self._successive_upper_lower_count = 0
self._successive_upper_lower_count_final = 0
self._last_alpha_seen = None
self._buf = False
self._current_ascii_only = True
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.
return 0.0
return (self._successive_upper_lower_count_final * 2) / self._character_count
return self._successive_upper_lower_count_final / self._character_count
def is_suspiciously_successive_range(unicode_range_a: Optional[str], unicode_range_b: Optional[str]) -> bool:
def is_suspiciously_successive_range(
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
"""
Determine if two Unicode range seen next to each other can be considered as suspicious.
"""
@@ -367,7 +453,9 @@ def is_suspiciously_successive_range(unicode_range_a: Optional[str], unicode_ran
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
return False
keywords_range_a, keywords_range_b = unicode_range_a.split(" "), unicode_range_b.split(" ")
keywords_range_a, keywords_range_b = unicode_range_a.split(
" "
), unicode_range_b.split(" ")
for el in keywords_range_a:
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
@@ -376,12 +464,19 @@ def is_suspiciously_successive_range(unicode_range_a: Optional[str], unicode_ran
return False
# Japanese Exception
if unicode_range_a in ['Katakana', 'Hiragana'] and unicode_range_b in ['Katakana', 'Hiragana']:
return False
if unicode_range_a in ['Katakana', 'Hiragana'] or unicode_range_b in ['Katakana', 'Hiragana']:
range_a_jp_chars, range_b_jp_chars = (
unicode_range_a
in (
"Hiragana",
"Katakana",
),
unicode_range_b in ("Hiragana", "Katakana"),
)
if range_a_jp_chars or range_b_jp_chars:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
return False
if range_a_jp_chars and range_b_jp_chars:
return False
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
@@ -390,30 +485,33 @@ def is_suspiciously_successive_range(unicode_range_a: Optional[str], unicode_ran
return False
# Chinese/Japanese use dedicated range for punctuation and/or separators.
if ('CJK' in unicode_range_a or 'CJK' in unicode_range_b) or (unicode_range_a in ['Katakana', 'Hiragana'] and unicode_range_b in ['Katakana', 'Hiragana']):
if 'Punctuation' in unicode_range_a or 'Punctuation' in unicode_range_b:
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
unicode_range_a in ["Katakana", "Hiragana"]
and unicode_range_b in ["Katakana", "Hiragana"]
):
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
return False
if 'Forms' in unicode_range_a or 'Forms' in unicode_range_b:
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
return False
return True
@lru_cache(maxsize=2048)
def mess_ratio(decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False) -> float:
def mess_ratio(
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
"""
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
"""
detectors = [] # type: List[MessDetectorPlugin]
for md_class in MessDetectorPlugin.__subclasses__():
detectors.append(
md_class()
)
detectors = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
] # type: List[MessDetectorPlugin]
length = len(decoded_sequence) # type: int
mean_mess_ratio = 0. # type: float
mean_mess_ratio = 0.0 # type: float
if length < 512:
intermediary_mean_mess_ratio_calc = 32 # type: int
@@ -427,25 +525,16 @@ def mess_ratio(decoded_sequence: str, maximum_threshold: float = 0.2, debug: boo
if detector.eligible(character):
detector.feed(character)
if (index > 0 and index % intermediary_mean_mess_ratio_calc == 0) or index == length-1:
mean_mess_ratio = sum(
[
dt.ratio for dt in detectors
]
)
if (
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
) or index == length - 1:
mean_mess_ratio = sum([dt.ratio for dt in detectors])
if mean_mess_ratio >= maximum_threshold:
break
if debug:
for dt in detectors: # pragma: nocover
print(
dt.__class__,
dt.ratio
)
return round(
mean_mess_ratio,
3
)
print(dt.__class__, dt.ratio)
return round(mean_mess_ratio, 3)
+102 -64
View File
@@ -1,25 +1,25 @@
import warnings
from collections import Counter
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from typing import Optional, List, Tuple, Set
from collections import Counter
from re import sub, compile as re_compile
from re import sub
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
from pipenv.vendor.charset_normalizer.constant import TOO_BIG_SEQUENCE
from pipenv.vendor.charset_normalizer.md import mess_ratio
from pipenv.vendor.charset_normalizer.utils import iana_name, is_multi_byte_encoding, unicode_range
from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
from .md import mess_ratio
from .utils import iana_name, is_multi_byte_encoding, unicode_range
class CharsetMatch:
def __init__(
self,
payload: bytes,
guessed_encoding: str,
mean_mess_ratio: float,
has_sig_or_bom: bool,
languages: "CoherenceMatches",
decoded_payload: Optional[str] = None
self,
payload: bytes,
guessed_encoding: str,
mean_mess_ratio: float,
has_sig_or_bom: bool,
languages: "CoherenceMatches",
decoded_payload: Optional[str] = None,
):
self._payload = payload # type: bytes
@@ -30,19 +30,23 @@ class CharsetMatch:
self._unicode_ranges = None # type: Optional[List[str]]
self._leaves = [] # type: List[CharsetMatch]
self._mean_coherence_ratio = 0. # type: float
self._mean_coherence_ratio = 0.0 # type: float
self._output_payload = None # type: Optional[bytes]
self._output_encoding = None # type: Optional[str]
self._string = decoded_payload # type: Optional[str]
def __eq__(self, other) -> bool:
def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
raise TypeError('__eq__ cannot be invoked on {} and {}.'.format(str(other.__class__), str(self.__class__)))
raise TypeError(
"__eq__ cannot be invoked on {} and {}.".format(
str(other.__class__), str(self.__class__)
)
)
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
def __lt__(self, other) -> bool:
def __lt__(self, other: object) -> bool:
"""
Implemented to make sorted available upon CharsetMatches items.
"""
@@ -50,13 +54,21 @@ class CharsetMatch:
raise ValueError
chaos_difference = abs(self.chaos - other.chaos) # type: float
coherence_difference = abs(self.coherence - other.coherence) # type: float
# Below 1% difference --> Use Coherence
if chaos_difference < 0.01:
if chaos_difference < 0.01 and coherence_difference > 0.02:
# When having a tough decision, use the result that decoded as many multi-byte as possible.
if chaos_difference == 0.0 and self.coherence == other.coherence:
return self.multi_byte_usage > other.multi_byte_usage
return self.coherence > other.coherence
return self.chaos < other.chaos
@property
def multi_byte_usage(self) -> float:
return 1.0 - len(str(self)) / len(self.raw)
@property
def chaos_secondary_pass(self) -> float:
"""
@@ -64,11 +76,11 @@ class CharsetMatch:
Use with caution, this can be very slow.
Notice: Will be removed in 3.0
"""
warnings.warn("chaos_secondary_pass is deprecated and will be removed in 3.0", DeprecationWarning)
return mess_ratio(
str(self),
1.
warnings.warn(
"chaos_secondary_pass is deprecated and will be removed in 3.0",
DeprecationWarning,
)
return mess_ratio(str(self), 1.0)
@property
def coherence_non_latin(self) -> float:
@@ -76,8 +88,11 @@ class CharsetMatch:
Coherence ratio on the first non-latin language detected if ANY.
Notice: Will be removed in 3.0
"""
warnings.warn("coherence_non_latin is deprecated and will be removed in 3.0", DeprecationWarning)
return 0.
warnings.warn(
"coherence_non_latin is deprecated and will be removed in 3.0",
DeprecationWarning,
)
return 0.0
@property
def w_counter(self) -> Counter:
@@ -85,9 +100,11 @@ class CharsetMatch:
Word counter instance on decoded text.
Notice: Will be removed in 3.0
"""
warnings.warn("w_counter is deprecated and will be removed in 3.0", DeprecationWarning)
not_printable_pattern = re_compile(r'[0-9\W\n\r\t]+')
string_printable_only = sub(not_printable_pattern, ' ', str(self).lower())
warnings.warn(
"w_counter is deprecated and will be removed in 3.0", DeprecationWarning
)
string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())
return Counter(string_printable_only.split())
@@ -102,7 +119,11 @@ class CharsetMatch:
def add_submatch(self, other: "CharsetMatch") -> None:
if not isinstance(other, CharsetMatch) or other == self:
raise ValueError("Unable to add instance <{}> as a submatch of a CharsetMatch".format(other.__class__))
raise ValueError(
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
other.__class__
)
)
other._string = None # Unload RAM usage; dirty trick.
self._leaves.append(other)
@@ -153,9 +174,13 @@ class CharsetMatch:
return "English"
# doing it there to avoid circular import
from pipenv.vendor.charset_normalizer.cd import mb_encoding_languages, encoding_languages
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
languages = mb_encoding_languages(self.encoding) if is_multi_byte_encoding(self.encoding) else encoding_languages(self.encoding)
languages = (
mb_encoding_languages(self.encoding)
if is_multi_byte_encoding(self.encoding)
else encoding_languages(self.encoding)
)
if len(languages) == 0 or "Latin Based" in languages:
return "Unknown"
@@ -171,7 +196,7 @@ class CharsetMatch:
@property
def coherence(self) -> float:
if not self._languages:
return 0.
return 0.0
return self._languages[0][1]
@property
@@ -201,12 +226,12 @@ class CharsetMatch:
def alphabets(self) -> List[str]:
if self._unicode_ranges is not None:
return self._unicode_ranges
detected_ranges = set() # type: Set[str]
for character in str(self):
detected_ranges.add(
unicode_range(character)
)
self._unicode_ranges = sorted(list(detected_ranges))
# list detected ranges
detected_ranges = [
unicode_range(char) for char in str(self)
] # type: List[Optional[str]]
# filter and sort
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
return self._unicode_ranges
@property
@@ -254,14 +279,15 @@ class CharsetMatches:
Container with every CharsetMatch items ordered by default from most probable to the less one.
Act like a list(iterable) but does not implements all related methods.
"""
def __init__(self, results: List[CharsetMatch] = None):
self._results = sorted(results) if results else [] # type: List[CharsetMatch]
def __iter__(self):
def __iter__(self) -> Iterator[CharsetMatch]:
for result in self._results:
yield result
def __getitem__(self, item) -> CharsetMatch:
def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
"""
Retrieve a single item either by its position or encoding name (alias may be used here).
Raise KeyError upon invalid index or encoding not present in results.
@@ -278,17 +304,24 @@ class CharsetMatches:
def __len__(self) -> int:
return len(self._results)
def __bool__(self) -> bool:
return len(self._results) > 0
def append(self, item: CharsetMatch) -> None:
"""
Insert a single match. Will be inserted accordingly to preserve sort.
Can be inserted as a submatch.
"""
if not isinstance(item, CharsetMatch):
raise ValueError("Cannot append instance '{}' to CharsetMatches".format(str(item.__class__)))
raise ValueError(
"Cannot append instance '{}' to CharsetMatches".format(
str(item.__class__)
)
)
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
if len(item.raw) <= TOO_BIG_SEQUENCE:
for match in self._results:
if match.fingerprint == item.fingerprint:
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
match.add_submatch(item)
return
self._results.append(item)
@@ -314,11 +347,23 @@ CoherenceMatches = List[CoherenceMatch]
class CliDetectionResult:
def __init__(self, path: str, encoding: str, encoding_aliases: List[str], alternative_encodings: List[str], language: str, alphabets: List[str], has_sig_or_bom: bool, chaos: float, coherence: float, unicode_path: Optional[str], is_preferred: bool):
def __init__(
self,
path: str,
encoding: Optional[str],
encoding_aliases: List[str],
alternative_encodings: List[str],
language: str,
alphabets: List[str],
has_sig_or_bom: bool,
chaos: float,
coherence: float,
unicode_path: Optional[str],
is_preferred: bool,
):
self.path = path # type: str
self.unicode_path = unicode_path # type: Optional[str]
self.encoding = encoding # type: str
self.encoding = encoding # type: Optional[str]
self.encoding_aliases = encoding_aliases # type: List[str]
self.alternative_encodings = alternative_encodings # type: List[str]
self.language = language # type: str
@@ -329,27 +374,20 @@ class CliDetectionResult:
self.is_preferred = is_preferred # type: bool
@property
def __dict__(self):
def __dict__(self) -> Dict[str, Any]: # type: ignore
return {
'path': self.path,
'encoding': self.encoding,
'encoding_aliases': self.encoding_aliases,
'alternative_encodings': self.alternative_encodings,
'language': self.language,
'alphabets': self.alphabets,
'has_sig_or_bom': self.has_sig_or_bom,
'chaos': self.chaos,
'coherence': self.coherence,
'unicode_path': self.unicode_path,
'is_preferred': self.is_preferred
"path": self.path,
"encoding": self.encoding,
"encoding_aliases": self.encoding_aliases,
"alternative_encodings": self.alternative_encodings,
"language": self.language,
"alphabets": self.alphabets,
"has_sig_or_bom": self.has_sig_or_bom,
"chaos": self.chaos,
"coherence": self.coherence,
"unicode_path": self.unicode_path,
"is_preferred": self.is_preferred,
}
def to_json(self) -> str:
return dumps(
self.__dict__,
ensure_ascii=True,
indent=4
)
CharsetNormalizerMatch = CharsetMatch
return dumps(self.__dict__, ensure_ascii=True, indent=4)
+114 -29
View File
@@ -1,19 +1,25 @@
try:
import unicodedata2 as unicodedata
except ImportError:
import unicodedata
import unicodedata # type: ignore[no-redef]
from codecs import IncrementalDecoder
from re import findall
from typing import Optional, Tuple, Union, List, Set
import importlib
from _multibytecodec import MultibyteIncrementalDecoder # type: ignore
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import List, Optional, Set, Tuple, Union
from pipenv.vendor.charset_normalizer.constant import UNICODE_RANGES_COMBINED, UNICODE_SECONDARY_RANGE_KEYWORD, \
RE_POSSIBLE_ENCODING_INDICATION, ENCODING_MARKS, UTF8_MAXIMAL_ALLOCATION, IANA_SUPPORTED_SIMILAR
from _multibytecodec import MultibyteIncrementalDecoder # type: ignore
from .constant import (
ENCODING_MARKS,
IANA_SUPPORTED_SIMILAR,
RE_POSSIBLE_ENCODING_INDICATION,
UNICODE_RANGES_COMBINED,
UNICODE_SECONDARY_RANGE_KEYWORD,
UTF8_MAXIMAL_ALLOCATION,
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
@@ -22,7 +28,14 @@ def is_accentuated(character: str) -> bool:
description = unicodedata.name(character) # type: str
except ValueError:
return False
return "WITH GRAVE" in description or "WITH ACUTE" in description or "WITH CEDILLA" in description
return (
"WITH GRAVE" in description
or "WITH ACUTE" in description
or "WITH CEDILLA" in description
or "WITH DIAERESIS" in description
or "WITH CIRCUMFLEX" in description
or "WITH TILDE" in description
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
@@ -33,12 +46,7 @@ def remove_accent(character: str) -> str:
codes = decomposed.split(" ") # type: List[str]
return chr(
int(
codes[0],
16
)
)
return chr(int(codes[0], 16))
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
@@ -64,6 +72,14 @@ def is_latin(character: str) -> bool:
return "LATIN" in description
def is_ascii(character: str) -> bool:
    """Return True when every code point of *character* is in the ASCII range.

    An empty string is considered ASCII, matching what encoding it to
    ``"ascii"`` would accept.
    """
    # The ASCII codec accepts exactly the code points 0..127.
    return all(ord(code_point) < 128 for code_point in character)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
character_category = unicodedata.category(character) # type: str
@@ -94,9 +110,19 @@ def is_symbol(character: str) -> bool:
return "Forms" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    """Return True when the Unicode range reported for *character* mentions "Emoticons".

    A character for which ``unicode_range`` yields ``None`` is never an emoticon.
    """
    range_name = unicode_range(character)  # type: Optional[str]

    return range_name is not None and "Emoticons" in range_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
if character.isspace() or character in ["", "+"]:
if character.isspace() or character in ["", "+", ",", ";", "<", ">"]:
return True
character_category = unicodedata.category(character) # type: str
@@ -104,12 +130,18 @@ def is_separator(character: str) -> bool:
return "Z" in character_category
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    """Return True when *character* is exactly one of lowercase or uppercase.

    Strings that are neither (digits, caseless scripts, the empty string)
    yield False.
    """
    lowered = character.islower()
    uppered = character.isupper()

    # Both operands are plain bools, so identity comparison acts as XOR.
    return lowered is not uppered
def is_private_use_only(character: str) -> bool:
    """Return True when the Unicode general category of *character* is ``Co`` (private use)."""
    return unicodedata.category(character) == "Co"
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
try:
character_name = unicodedata.name(character)
@@ -119,6 +151,46 @@ def is_cjk(character: str) -> bool:
return "CJK" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    """Return True when the Unicode name of *character* contains "HIRAGANA".

    Characters that have no Unicode name are reported as False.
    """
    # An empty default replaces the ValueError raised for unnamed characters.
    return "HIRAGANA" in unicodedata.name(character, "")
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    """Return True when the Unicode name of *character* contains "KATAKANA".

    Characters that have no Unicode name are reported as False.
    """
    # An empty default replaces the ValueError raised for unnamed characters.
    return "KATAKANA" in unicodedata.name(character, "")
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    """Return True when the Unicode name of *character* contains "HANGUL".

    Characters that have no Unicode name are reported as False.
    """
    # An empty default replaces the ValueError raised for unnamed characters.
    return "HANGUL" in unicodedata.name(character, "")
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    """Return True when the Unicode name of *character* contains "THAI".

    Characters that have no Unicode name are reported as False.
    """
    # An empty default replaces the ValueError raised for unnamed characters.
    return "THAI" in unicodedata.name(character, "")
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
@@ -139,14 +211,16 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
results = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[:seq_len if seq_len <= search_zone else search_zone].decode('ascii', errors='ignore')
sequence[: seq_len if seq_len <= search_zone else search_zone].decode(
"ascii", errors="ignore"
),
) # type: List[str]
if len(results) == 0:
return None
for specified_encoding in results:
specified_encoding = specified_encoding.lower().replace('-', '_')
specified_encoding = specified_encoding.lower().replace("-", "_")
for encoding_alias, encoding_iana in aliases.items():
if encoding_alias == specified_encoding:
@@ -162,9 +236,19 @@ def is_multi_byte_encoding(name: str) -> bool:
"""
Verify whether a specific encoding is a multi-byte one based on its IANA name
"""
return name in {"utf_8", "utf_8_sig", "utf_16", "utf_16_be", "utf_16_le", "utf_32", "utf_32_le", "utf_32_be", "utf_7"} or issubclass(
importlib.import_module('encodings.{}'.format(name)).IncrementalDecoder, # type: ignore
MultibyteIncrementalDecoder
return name in {
"utf_8",
"utf_8_sig",
"utf_16",
"utf_16_be",
"utf_16_le",
"utf_32",
"utf_32_le",
"utf_32_be",
"utf_7",
} or issubclass(
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder, # type: ignore
MultibyteIncrementalDecoder,
)
@@ -191,7 +275,7 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
def iana_name(cp_name: str, strict: bool = True) -> str:
cp_name = cp_name.lower().replace('-', '_')
cp_name = cp_name.lower().replace("-", "_")
for encoding_alias, encoding_iana in aliases.items():
if cp_name == encoding_alias or cp_name == encoding_iana:
@@ -212,9 +296,7 @@ def range_scan(decoded_sequence: str) -> List[str]:
if character_range is None:
continue
ranges.add(
character_range
)
ranges.add(character_range)
return list(ranges)
@@ -222,10 +304,10 @@ def range_scan(decoded_sequence: str) -> List[str]:
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
return 0.
return 0.0
decoder_a = importlib.import_module('encodings.{}'.format(iana_name_a)).IncrementalDecoder # type: ignore
decoder_b = importlib.import_module('encodings.{}'.format(iana_name_b)).IncrementalDecoder # type: ignore
decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder # type: ignore
decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder # type: ignore
id_a = decoder_a(errors="ignore") # type: IncrementalDecoder
id_b = decoder_b(errors="ignore") # type: IncrementalDecoder
@@ -245,4 +327,7 @@ def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
Determine if two code pages are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
the function cp_similarity.
"""
return iana_name_a in IANA_SUPPORTED_SIMILAR and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
return (
iana_name_a in IANA_SUPPORTED_SIMILAR
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
)
+2 -2
View File
@@ -2,5 +2,5 @@
Expose version
"""
__version__ = "2.0.3"
VERSION = __version__.split('.')
__version__ = "2.0.7"
VERSION = __version__.split(".")
+1 -1
View File
@@ -3,7 +3,7 @@ attrs==21.2.0
cached-property==1.5.2
cerberus==1.3.4
certifi==2021.5.30
charset-normalizer==2.0.3
charset-normalizer==2.0.7
click-didyoumean==0.0.3
click==8.0.3
colorama==0.4.4