From 511561122bd261b1ea045ac28bcfb2c1538bf2e9 Mon Sep 17 00:00:00 2001 From: Jeff Tratner Date: Thu, 15 Mar 2018 03:12:01 -0700 Subject: [PATCH 1/5] Cache pipfile parsing On a (390+ line) Pipfile, it takes ~5s to parse the entire thing :O. Pipenv has to parse the pipfile repeatedly and all over the place, so caching the contents speeds things up dramatically (at least in this case). This PR establishes a little cache based upon file location + md5sum of contents for the pipfile (and the hashing is pretty fast here). Given that the cache key is based on the file contents, should be completely fine to do (only possible issue is if parsed_pipfile gets mutated - which is why I've added a defensive deepcopy). Without the deepcopy, cache hit takes ~0.09ms. With the deepcopy, cache hit takes 1.19ms. If we can confirm no need for deepcopy, would shave off a second or two off really big Pipfiles. --- pipenv/project.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pipenv/project.py b/pipenv/project.py index bdb1b98b..225914b6 100644 --- a/pipenv/project.py +++ b/pipenv/project.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import copy import json import os import re @@ -10,6 +11,7 @@ import hashlib import contoml import delegator import pipfile +import threading import toml from pip9 import ConfigOptionParser @@ -47,6 +49,10 @@ if PIPENV_PIPFILE: PIPENV_PIPFILE = normalize_drive(os.path.abspath(PIPENV_PIPFILE)) +_cache = threading.local() +_cache.pipfile_cache = {} + + class Project(object): """docstring for Project""" @@ -292,6 +298,15 @@ class Project(object): # Open the pipfile, read it into memory. with open(self.pipfile_location) as f: contents = f.read() + # this should be pretty fast (ish) and we need this pipfile a lot + cache_key = (self.pipfile_location, hashlib.md5(contents.encode('utf8')).hexdigest()) + if cache_key not in _cache.pipfile_cache: + parsed = self._parse_pipfile(contents) + _cache.pipfile_cache[cache_key] = parsed + # deepcopy likely unnecessary but why not avoid bugs? + return copy.deepcopy(_cache.pipfile_cache[cache_key]) + + def _parse_pipfile(self, contents): # If any outline tables are present... if ('[packages.' in contents) or ('[dev-packages.' in contents): data = toml.loads(contents) From bde5bb35c7e85bc27fdb57d76afc608187e87774 Mon Sep 17 00:00:00 2001 From: Jeff Tratner Date: Thu, 15 Mar 2018 03:47:04 -0700 Subject: [PATCH 2/5] Apparently you cannot deepcopy TOMLFile objects Weirdly (seems like) it only fails in test suite but not in actual usage, but manual vetting suggests we don't modify the pipfile without writing it to disk pretty much immediately. --- pipenv/project.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pipenv/project.py b/pipenv/project.py index 225914b6..51cfba08 100644 --- a/pipenv/project.py +++ b/pipenv/project.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -import copy import json import os import re @@ -304,7 +303,7 @@ class Project(object): parsed = self._parse_pipfile(contents) _cache.pipfile_cache[cache_key] = parsed # deepcopy likely unnecessary but why not avoid bugs? - return copy.deepcopy(_cache.pipfile_cache[cache_key]) + return _cache.pipfile_cache[cache_key] def _parse_pipfile(self, contents): # If any outline tables are present... From dfb1a19dcf025ef100a14f30568c8d2dc1689026 Mon Sep 17 00:00:00 2001 From: Jeff Tratner Date: Thu, 15 Mar 2018 03:53:39 -0700 Subject: [PATCH 3/5] Clear pipfile cache when it seems like we might be mutating --- pipenv/core.py | 1 + pipenv/project.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pipenv/core.py b/pipenv/core.py index 00fe1661..3cb8b6cd 100644 --- a/pipenv/core.py +++ b/pipenv/core.py @@ -291,6 +291,7 @@ def ensure_pipfile(validate=True, skip_requirements=False): if validate and project.virtualenv_exists and not PIPENV_SKIP_VALIDATION: # Ensure that Pipfile is using proper casing. p = project.parsed_pipfile + p.clear_pipfile_cache() changed = ensure_proper_casing(pfile=p) # Write changes out to disk. if changed: diff --git a/pipenv/project.py b/pipenv/project.py index 51cfba08..d90ffc12 100644 --- a/pipenv/project.py +++ b/pipenv/project.py @@ -294,6 +294,9 @@ class Project(object): @property def parsed_pipfile(self): + """Parse Pipfile into a TOMLFile and cache it + + (call clear_pipfile_cache() afterwards if mutating)""" # Open the pipfile, read it into memory. with open(self.pipfile_location) as f: contents = f.read() @@ -302,9 +305,12 @@ class Project(object): if cache_key not in _cache.pipfile_cache: parsed = self._parse_pipfile(contents) _cache.pipfile_cache[cache_key] = parsed - # deepcopy likely unnecessary but why not avoid bugs? return _cache.pipfile_cache[cache_key] + def clear_pipfile_cache(self): + """Clear pipfile cache (e.g., so we can mutate parsed pipfile)""" + _cache.pipfile_cache.clear() + def _parse_pipfile(self, contents): # If any outline tables are present... if ('[packages.' in contents) or ('[dev-packages.' in contents): @@ -338,8 +344,10 @@ class Project(object): def _pipfile(self): """Pipfile divided by PyPI and external dependencies.""" pfile = self.parsed_pipfile + # mutation time! + self.clear_pipfile_cache() for section in ('packages', 'dev-packages'): - p_section = pfile.get(section, {}) + p_section = dict(pfile.get(section, {})) for key in list(p_section.keys()): # Normalize key name to PEP 423. norm_key = pep423_name(key) @@ -370,6 +378,7 @@ class Project(object): p['pipenv'] = settings # Write the changes to disk. self.write_toml(p) + self.clear_pipfile_cache() @property def _lockfile(self): From 3925f7a537478dda622a3a745272e1d64cfcc4ec Mon Sep 17 00:00:00 2001 From: Jeff Tratner Date: Fri, 16 Mar 2018 09:43:54 -0700 Subject: [PATCH 4/5] Do not bother hashing --- pipenv/project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipenv/project.py b/pipenv/project.py index d90ffc12..eb33c169 100644 --- a/pipenv/project.py +++ b/pipenv/project.py @@ -300,8 +300,8 @@ class Project(object): # Open the pipfile, read it into memory. with open(self.pipfile_location) as f: contents = f.read() - # this should be pretty fast (ish) and we need this pipfile a lot - cache_key = (self.pipfile_location, hashlib.md5(contents.encode('utf8')).hexdigest()) + # use full contents to get around str/bytes 2/3 issues + cache_key = (self.pipfile_location, contents) if cache_key not in _cache.pipfile_cache: parsed = self._parse_pipfile(contents) _cache.pipfile_cache[cache_key] = parsed From 8a086f175a9642d04b75f481a8192b1ef08e4323 Mon Sep 17 00:00:00 2001 From: Jeff Tratner Date: Mon, 19 Mar 2018 09:57:47 -0700 Subject: [PATCH 5/5] Stop using a threading.local --- pipenv/project.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pipenv/project.py b/pipenv/project.py index eb33c169..968c371b 100644 --- a/pipenv/project.py +++ b/pipenv/project.py @@ -10,7 +10,6 @@ import hashlib import contoml import delegator import pipfile -import threading import toml from pip9 import ConfigOptionParser @@ -48,8 +47,9 @@ if PIPENV_PIPFILE: PIPENV_PIPFILE = normalize_drive(os.path.abspath(PIPENV_PIPFILE)) -_cache = threading.local() -_cache.pipfile_cache = {} +# (path, file contents) => TOMLFile +# keeps track of pipfiles that we've seen so we do not need to re-parse 'em +_pipfile_cache = {} class Project(object): @@ -302,14 +302,14 @@ class Project(object): contents = f.read() # use full contents to get around str/bytes 2/3 issues cache_key = (self.pipfile_location, contents) - if cache_key not in _cache.pipfile_cache: + if cache_key not in _pipfile_cache: parsed = self._parse_pipfile(contents) - _cache.pipfile_cache[cache_key] = parsed - return _cache.pipfile_cache[cache_key] + _pipfile_cache[cache_key] = parsed + return _pipfile_cache[cache_key] def clear_pipfile_cache(self): """Clear pipfile cache (e.g., so we can mutate parsed pipfile)""" - _cache.pipfile_cache.clear() + _pipfile_cache.clear() def _parse_pipfile(self, contents): # If any outline tables are present...