add encoding parameter to all file open() calls in code samples, example files, and text

2026-06-05 23:10:17 +00:00 · 2009-07-16 12:36:37 -04:00
parent 49fae282ec
commit e35d9d1bda
8 changed files with 24 additions and 23 deletions
@@ -309,19 +309,19 @@ StopIteration</samp>

 <p class=d>[<a href=examples/favorite-people.txt>download <code>favorite-people.txt</code></a>]
 <pre class=screen>
-<a><samp class=p>>>> </samp><kbd class=pp>names = list(open('examples/favorite-people.txt'))</kbd>  <span class=u>&#x2460;</span></a>
+<a><samp class=p>>>> </samp><kbd class=pp>names = list(open('examples/favorite-people.txt', encoding='utf-8'))</kbd>  <span class=u>&#x2460;</span></a>
 <samp class=p>>>> </samp><kbd class=pp>names</kbd>
 <samp class=pp>['Dora\n', 'Ethan\n', 'Wesley\n', 'John\n', 'Anne\n',
 'Mike\n', 'Chris\n', 'Sarah\n', 'Alex\n', 'Lizzie\n']</samp>
-<a><samp class=p>>>> </samp><kbd class=pp>names = [name.rstrip() for name in names]</kbd>           <span class=u>&#x2461;</span></a>
+<a><samp class=p>>>> </samp><kbd class=pp>names = [name.rstrip() for name in names]</kbd>                             <span class=u>&#x2461;</span></a>
 <samp class=p>>>> </samp><kbd class=pp>names</kbd>
 <samp class=pp>['Dora', 'Ethan', 'Wesley', 'John', 'Anne',
 'Mike', 'Chris', 'Sarah', 'Alex', 'Lizzie']</samp>
-<a><samp class=p>>>> </samp><kbd class=pp>names = sorted(names)</kbd>                               <span class=u>&#x2462;</span></a>
+<a><samp class=p>>>> </samp><kbd class=pp>names = sorted(names)</kbd>                                                 <span class=u>&#x2462;</span></a>
 <samp class=p>>>> </samp><kbd class=pp>names</kbd>
 <samp class=pp>['Alex', 'Anne', 'Chris', 'Dora', 'Ethan',
 'John', 'Lizzie', 'Mike', 'Sarah', 'Wesley']</samp>
-<a><samp class=p>>>> </samp><kbd class=pp>names = sorted(names, key=len)</kbd>                      <span class=u>&#x2463;</span></a>
+<a><samp class=p>>>> </samp><kbd class=pp>names = sorted(names, key=len)</kbd>                                        <span class=u>&#x2463;</span></a>
 <samp class=p>>>> </samp><kbd class=pp>names</kbd>
 <samp class=pp>['Alex', 'Anne', 'Dora', 'John', 'Mike',
 'Chris', 'Ethan', 'Sarah', 'Lizzie', 'Wesley']</samp></pre>
@@ -13,6 +13,7 @@
 * TODO 2nd draft Refactoring
 * TODO 1st draft Advanced Classes
 * TODO 1st draft Files
+  SCHEDULED: <2009-07-16 Thu>
 ** Reading from text files
 *** Opening a file (to read)
 *** Character encoding
@@ -15,7 +15,7 @@ def build_match_and_apply_functions(pattern, search, replace):
    return [matches_rule, apply_rule]

 rules = []
-with open('plural4-rules.txt') as pattern_file:
+with open('plural4-rules.txt', encoding='utf-8') as pattern_file:
    for line in pattern_file:
        pattern, search, replace = line.split(None, 3)
        rules.append(build_match_and_apply_functions(
@@ -15,7 +15,7 @@ def build_match_and_apply_functions(pattern, search, replace):
    return [matches_rule, apply_rule]

 def rules(rules_filename):
-    with open(rules_filename) as pattern_file:
+    with open(rules_filename, encoding='utf-8') as pattern_file:
        for line in pattern_file:
            pattern, search, replace = line.split(None, 3)
            yield build_match_and_apply_functions(pattern, search, replace)
@@ -18,7 +18,7 @@ class LazyRules:
    rules_filename = 'plural6-rules.txt'

    def __iter__(self):
-        self.pattern_file = open(self.rules_filename)
+        self.pattern_file = open(self.rules_filename, encoding='utf-8')
        self.cache = []
        self.cache_index = 0
        return self
@@ -22,14 +22,6 @@ body{counter-reset:h1 12}
 <h2 id=divingin>Diving In</h2>
 <p class=f>FIXME

-<!--
-FIXME move this to character encoding section
-
-OK, so a string is a sequence of Unicode characters. But a file on disk is not a sequence of Unicode characters; a file on disk is a sequence of bytes. So if you read a &#8220;text file&#8221; from disk, how does Python convert that sequence of bytes into a sequence of characters? The answer is that it decodes the bytes according to a specific character encoding algorithm, and returns a sequence of Unicode characters, otherwise known as a string.
-
-"The default encoding is platform dependent (whatever locale.getpreferredencoding() returns)." -- http://docs.python.org/3.1/library/io.html
--->
-
 <h2 id=reading-from-text-files>Reading From Text Files</h2>

 <p>FIXME
@@ -41,7 +33,11 @@ open(..., 'r', encoding='...')

 <h3 id=encoding>Character Encoding Rears Its Ugly Head</h3>

-<p>FIXME
+<!--
+OK, so a string is a sequence of Unicode characters. But a file on disk is not a sequence of Unicode characters; a file on disk is a sequence of bytes. So if you read a &#8220;text file&#8221; from disk, how does Python convert that sequence of bytes into a sequence of characters? The answer is that it decodes the bytes according to a specific character encoding algorithm, and returns a sequence of Unicode characters, otherwise known as a string.
+
+"The default encoding is platform dependent (whatever locale.getpreferredencoding() returns)." -- http://docs.python.org/3.1/library/io.html
+-->

 <h3 id=file-objects>File Objects</h3>

@@ -134,6 +130,10 @@ ValueError: I/O operation on closed file</samp>

 <p>FIXME what's a "line"? (line endings discussion, universal line endings, etc.)

+<!--
+A &#8220;line&#8221; of a text file is just what you think it is&nbsp;&mdash;&nbsp;a sequence of characters delimited by a carriage return. Of course, it can&#8217;t really be that simple, can it? Text files can use several different characters to mark the end of a line. Some use a carriage return character, others use a line feed character, and some use both characters at the end of every line. Python handles all of these cases automatically, so you can say, &#8220;Hey, I want to read this text file one line at a time&#8221; and it will Just Work.
+-->
+
 <h2 id=write>Writing to Text Files</h2>

 <p>FIXME
@@ -195,7 +195,7 @@ test succeededline 2
 <li>At last, you handle your <code>IOError</code> exception. This could be the <code>IOError</code> exception raised by the call to <code>open</code>, <code>seek</code>, or <code>read</code>. Here, you really don&#8217;t care, because all you&#8217;re going to do is ignore it silently and continue. (Remember, <code>pass</code> is a Python statement that <a href="#fileinfo.class.simplest" title="Example 5.3. The Simplest Python Class">does nothing</a>.)  That&#8217;s perfectly legal; &#8220;handling&#8221; an exception can mean explicitly doing nothing. It still counts as handled, and processing will continue normally on the    next line of code after the <code>try...except</code> block.
 -->

-<h2 id=binary-files>Binary Files</h2>
+<h2 id=binary>Binary Files</h2>

 <p>FIXME

@@ -296,7 +296,7 @@ rules = []

 <p class=d>[<a href=examples/plural5.py>download <code>plural5.py</code></a>]
 <pre class=nd><code class=pp>def rules():
-    with open('plural5-rules.txt') as pattern_file:
+    with open('plural5-rules.txt', encoding='utf-8') as pattern_file:
        for line in pattern_file:
            pattern, search, replace = line.split(None, 3)
            yield build_match_and_apply_functions(pattern, search, replace)
@@ -376,7 +376,7 @@ def plural(noun):
 <p>Let&#8217;s go back to <code>plural5.py</code> and see how this version of the <code>plural()</code> function works.

 <pre><code class=pp>def rules(rules_filename):
-    with open(rules_filename) as pattern_file:
+    with open(rules_filename, encoding='utf-8') as pattern_file:
        for line in pattern_file:
 <a>            pattern, search, replace = line.split(None, 3)                   <span class=u>&#x2460;</span></a>
 <a>            yield build_match_and_apply_functions(pattern, search, replace)  <span class=u>&#x2461;</span></a>
@@ -218,7 +218,7 @@ All three of these class methods, <code>__init__</code>, <code>__iter__</code>,
    rules_filename = 'plural6-rules.txt'

    def __iter__(self):
-        self.pattern_file = open(self.rules_filename)
+        self.pattern_file = open(self.rules_filename, encoding='utf-8')
        self.cache = []
        self.cache_index = 0
        return self
@@ -251,9 +251,9 @@ rules = LazyRules()</code></pre>
 <pre><code class=pp>class LazyRules:
    rules_filename = 'plural6-rules.txt'

-<a>    def __iter__(self):                                <span class=u>&#x2460;</span></a>
-<a>        self.pattern_file = open(self.rules_filename)  <span class=u>&#x2461;</span></a>
-<a>        self.cache = []                                <span class=u>&#x2462;</span></a>
+<a>    def __iter__(self):                                                  <span class=u>&#x2460;</span></a>
+<a>        self.pattern_file = open(self.rules_filename, encoding='utf-8')  <span class=u>&#x2461;</span></a>
+<a>        self.cache = []                                                  <span class=u>&#x2462;</span></a>
        self.cache_index = 0</code></pre>
 <ol>
 <li>The <code>__iter__()</code> method is only going to be called once, after you instantiate the class, assign it to <var>rules</var>, and call <code>iter(rules)</code> to create the iterator.