diff --git a/case-study-porting-chardet-to-python-3.html b/case-study-porting-chardet-to-python-3.html index 715fdbb..1d78229 100644 --- a/case-study-porting-chardet-to-python-3.html +++ b/case-study-porting-chardet-to-python-3.html @@ -3,6 +3,7 @@ Case study: porting chardet to Python 3 - Dive into Python 3 + @@ -42,6 +43,8 @@ body{counter-reset:h1 20}
  • Name 'file' is not defined
  • Can’t use a string pattern on a bytes-like object
  • Can’t convert 'bytes' object to str implicitly +
  • TypeError: unsupported operand type(s) for +: 'int' and 'bytes' +
  • TypeError: ord() expected string of length 1, but int found

    Introducing chardet: a mini-FAQ

    @@ -111,8 +114,8 @@ RefactoringTool: Skipping implicit fixer: ws_comma __version__ = "1.0.1" def detect(aBuf): -- import universaldetector -+ from . import universaldetector +- import universaldetector ++ from . import universaldetector u = universaldetector.UniversalDetector() u.reset() u.feed(aBuf) @@ -122,14 +125,14 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --from mbcharsetprober import MultiByteCharSetProber --from codingstatemachine import CodingStateMachine --from chardistribution import Big5DistributionAnalysis --from mbcssm import Big5SMModel -+from .mbcharsetprober import MultiByteCharSetProber -+from .codingstatemachine import CodingStateMachine -+from .chardistribution import Big5DistributionAnalysis -+from .mbcssm import Big5SMModel +-from mbcharsetprober import MultiByteCharSetProber +-from codingstatemachine import CodingStateMachine +-from chardistribution import Big5DistributionAnalysis +-from mbcssm import Big5SMModel ++from .mbcharsetprober import MultiByteCharSetProber ++from .codingstatemachine import CodingStateMachine ++from .chardistribution import Big5DistributionAnalysis ++from .mbcssm import Big5SMModel class Big5Prober(MultiByteCharSetProber): def __init__(self): @@ -139,18 +142,18 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --import constants --from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO --from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO --from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO --from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO --from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO -+from . 
import constants -+from .euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO -+from .euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO -+from .gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO -+from .big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO -+from .jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO +-import constants +-from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO +-from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO +-from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO +-from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO +-from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO ++from . 
import constants ++from .euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO ++from .euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO ++from .gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO ++from .big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO ++from .jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO ENOUGH_DATA_THRESHOLD = 1024 SURE_YES = 0.99 @@ -160,8 +163,8 @@ RefactoringTool: Skipping implicit fixer: ws_comma ######################### END LICENSE BLOCK ######################### import constants, sys --from charsetprober import CharSetProber -+from .charsetprober import CharSetProber +-from charsetprober import CharSetProber ++from .charsetprober import CharSetProber class CharSetGroupProber(CharSetProber): def __init__(self): @@ -171,8 +174,8 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --from constants import eStart, eError, eItsMe -+from .constants import eStart, eError, eItsMe +-from constants import eStart, eError, eItsMe ++from .constants import eStart, eError, eItsMe class CodingStateMachine: def __init__(self, sm): @@ -182,28 +185,28 @@ RefactoringTool: Skipping implicit fixer: ws_comma SHORTCUT_THRESHOLD = 0.95 --import __builtin__ -+import builtins +-import __builtin__ ++import builtins if not hasattr(__builtin__, 'False'): False = 0 True = 1 else: -- False = __builtin__.False -- True = __builtin__.True -+ False = builtins.False -+ True = builtins.True +- False = __builtin__.False +- True = __builtin__.True ++ False = builtins.False ++ True = builtins.True --- chardet\escprober.py (original) +++ chardet\escprober.py (refactored) @@ -26,9 +26,9 @@ ######################### END LICENSE BLOCK ######################### import constants, sys --from escsm import 
HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel --from charsetprober import CharSetProber --from codingstatemachine import CodingStateMachine -+from .escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel -+from .charsetprober import CharSetProber -+from .codingstatemachine import CodingStateMachine +-from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel +-from charsetprober import CharSetProber +-from codingstatemachine import CodingStateMachine ++from .escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel ++from .charsetprober import CharSetProber ++from .codingstatemachine import CodingStateMachine class EscCharSetProber(CharSetProber): def __init__(self): @@ -213,8 +216,8 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --from constants import eStart, eError, eItsMe -+from .constants import eStart, eError, eItsMe +-from constants import eStart, eError, eItsMe ++from .constants import eStart, eError, eItsMe HZ_cls = ( \ 1,0,0,0,0,0,0,0, # 00 - 07 @@ -224,18 +227,18 @@ RefactoringTool: Skipping implicit fixer: ws_comma ######################### END LICENSE BLOCK ######################### import constants, sys --from constants import eStart, eError, eItsMe --from mbcharsetprober import MultiByteCharSetProber --from codingstatemachine import CodingStateMachine --from chardistribution import EUCJPDistributionAnalysis --from jpcntx import EUCJPContextAnalysis --from mbcssm import EUCJPSMModel -+from .constants import eStart, eError, eItsMe -+from .mbcharsetprober import MultiByteCharSetProber -+from .codingstatemachine import CodingStateMachine -+from .chardistribution import EUCJPDistributionAnalysis -+from .jpcntx import EUCJPContextAnalysis -+from .mbcssm import EUCJPSMModel +-from constants import eStart, eError, eItsMe +-from mbcharsetprober import MultiByteCharSetProber +-from 
codingstatemachine import CodingStateMachine +-from chardistribution import EUCJPDistributionAnalysis +-from jpcntx import EUCJPContextAnalysis +-from mbcssm import EUCJPSMModel ++from .constants import eStart, eError, eItsMe ++from .mbcharsetprober import MultiByteCharSetProber ++from .codingstatemachine import CodingStateMachine ++from .chardistribution import EUCJPDistributionAnalysis ++from .jpcntx import EUCJPContextAnalysis ++from .mbcssm import EUCJPSMModel class EUCJPProber(MultiByteCharSetProber): def __init__(self): @@ -245,14 +248,14 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --from mbcharsetprober import MultiByteCharSetProber --from codingstatemachine import CodingStateMachine --from chardistribution import EUCKRDistributionAnalysis --from mbcssm import EUCKRSMModel -+from .mbcharsetprober import MultiByteCharSetProber -+from .codingstatemachine import CodingStateMachine -+from .chardistribution import EUCKRDistributionAnalysis -+from .mbcssm import EUCKRSMModel +-from mbcharsetprober import MultiByteCharSetProber +-from codingstatemachine import CodingStateMachine +-from chardistribution import EUCKRDistributionAnalysis +-from mbcssm import EUCKRSMModel ++from .mbcharsetprober import MultiByteCharSetProber ++from .codingstatemachine import CodingStateMachine ++from .chardistribution import EUCKRDistributionAnalysis ++from .mbcssm import EUCKRSMModel class EUCKRProber(MultiByteCharSetProber): def __init__(self): @@ -262,14 +265,14 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --from mbcharsetprober import MultiByteCharSetProber --from codingstatemachine import CodingStateMachine --from chardistribution import EUCTWDistributionAnalysis --from mbcssm import EUCTWSMModel -+from .mbcharsetprober import MultiByteCharSetProber -+from .codingstatemachine import 
CodingStateMachine -+from .chardistribution import EUCTWDistributionAnalysis -+from .mbcssm import EUCTWSMModel +-from mbcharsetprober import MultiByteCharSetProber +-from codingstatemachine import CodingStateMachine +-from chardistribution import EUCTWDistributionAnalysis +-from mbcssm import EUCTWSMModel ++from .mbcharsetprober import MultiByteCharSetProber ++from .codingstatemachine import CodingStateMachine ++from .chardistribution import EUCTWDistributionAnalysis ++from .mbcssm import EUCTWSMModel class EUCTWProber(MultiByteCharSetProber): def __init__(self): @@ -279,14 +282,14 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --from mbcharsetprober import MultiByteCharSetProber --from codingstatemachine import CodingStateMachine --from chardistribution import GB2312DistributionAnalysis --from mbcssm import GB2312SMModel -+from .mbcharsetprober import MultiByteCharSetProber -+from .codingstatemachine import CodingStateMachine -+from .chardistribution import GB2312DistributionAnalysis -+from .mbcssm import GB2312SMModel +-from mbcharsetprober import MultiByteCharSetProber +-from codingstatemachine import CodingStateMachine +-from chardistribution import GB2312DistributionAnalysis +-from mbcssm import GB2312SMModel ++from .mbcharsetprober import MultiByteCharSetProber ++from .codingstatemachine import CodingStateMachine ++from .chardistribution import GB2312DistributionAnalysis ++from .mbcssm import GB2312SMModel class GB2312Prober(MultiByteCharSetProber): def __init__(self): @@ -296,10 +299,10 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --from charsetprober import CharSetProber --import constants -+from .charsetprober import CharSetProber -+from . import constants +-from charsetprober import CharSetProber +-import constants ++from .charsetprober import CharSetProber ++from . 
import constants # This prober doesn't actually recognize a language or a charset. # It is a helper prober for the use of the Hebrew model probers @@ -309,8 +312,8 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --import constants -+from . import constants +-import constants ++from . import constants NUM_OF_CATEGORY = 6 DONT_KNOW = -1 @@ -320,8 +323,8 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --import constants -+from . import constants +-import constants ++from . import constants # 255: Control characters that usually does not exist in any text # 254: Carriage/Return @@ -331,8 +334,8 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --import constants -+from . import constants +-import constants ++from . import constants # KOI8-R language model # Character Mapping Table: @@ -342,8 +345,8 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --import constants -+from . import constants +-import constants ++from . import constants # 255: Control characters that usually does not exist in any text # 254: Carriage/Return @@ -353,8 +356,8 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --import constants -+from . import constants +-import constants ++from . import constants # 255: Control characters that usually does not exist in any text # 254: Carriage/Return @@ -364,8 +367,8 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --import constants -+from . import constants +-import constants ++from . 
import constants # 255: Control characters that usually does not exist in any text # 254: Carriage/Return @@ -375,8 +378,8 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --import constants -+from . import constants +-import constants ++from . import constants # 255: Control characters that usually does not exist in any text # 254: Carriage/Return @@ -386,10 +389,10 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --from charsetprober import CharSetProber --import constants -+from .charsetprober import CharSetProber -+from . import constants +-from charsetprober import CharSetProber +-import constants ++from .charsetprober import CharSetProber ++from . import constants import operator FREQ_CAT_NUM = 4 @@ -399,10 +402,10 @@ RefactoringTool: Skipping implicit fixer: ws_comma ######################### END LICENSE BLOCK ######################### import constants, sys --from constants import eStart, eError, eItsMe --from charsetprober import CharSetProber -+from .constants import eStart, eError, eItsMe -+from .charsetprober import CharSetProber +-from constants import eStart, eError, eItsMe +-from charsetprober import CharSetProber ++from .constants import eStart, eError, eItsMe ++from .charsetprober import CharSetProber class MultiByteCharSetProber(CharSetProber): def __init__(self): @@ -412,22 +415,22 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --from charsetgroupprober import CharSetGroupProber --from utf8prober import UTF8Prober --from sjisprober import SJISProber --from eucjpprober import EUCJPProber --from gb2312prober import GB2312Prober --from euckrprober import EUCKRProber --from big5prober import Big5Prober --from euctwprober import EUCTWProber -+from .charsetgroupprober import 
CharSetGroupProber -+from .utf8prober import UTF8Prober -+from .sjisprober import SJISProber -+from .eucjpprober import EUCJPProber -+from .gb2312prober import GB2312Prober -+from .euckrprober import EUCKRProber -+from .big5prober import Big5Prober -+from .euctwprober import EUCTWProber +-from charsetgroupprober import CharSetGroupProber +-from utf8prober import UTF8Prober +-from sjisprober import SJISProber +-from eucjpprober import EUCJPProber +-from gb2312prober import GB2312Prober +-from euckrprober import EUCKRProber +-from big5prober import Big5Prober +-from euctwprober import EUCTWProber ++from .charsetgroupprober import CharSetGroupProber ++from .utf8prober import UTF8Prober ++from .sjisprober import SJISProber ++from .eucjpprober import EUCJPProber ++from .gb2312prober import GB2312Prober ++from .euckrprober import EUCKRProber ++from .big5prober import Big5Prober ++from .euctwprober import EUCTWProber class MBCSGroupProber(CharSetGroupProber): def __init__(self): @@ -437,8 +440,8 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --from constants import eStart, eError, eItsMe -+from .constants import eStart, eError, eItsMe +-from constants import eStart, eError, eItsMe ++from .constants import eStart, eError, eItsMe # BIG5 @@ -448,8 +451,8 @@ RefactoringTool: Skipping implicit fixer: ws_comma ######################### END LICENSE BLOCK ######################### import constants, sys --from charsetprober import CharSetProber -+from .charsetprober import CharSetProber +-from charsetprober import CharSetProber ++from .charsetprober import CharSetProber SAMPLE_SIZE = 64 SB_ENOUGH_REL_THRESHOLD = 1024 @@ -459,24 +462,24 @@ RefactoringTool: Skipping implicit fixer: ws_comma ######################### END LICENSE BLOCK ######################### import constants, sys --from charsetgroupprober import CharSetGroupProber --from sbcharsetprober import SingleByteCharSetProber 
--from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model --from langgreekmodel import Latin7GreekModel, Win1253GreekModel --from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel --from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel --from langthaimodel import TIS620ThaiModel --from langhebrewmodel import Win1255HebrewModel --from hebrewprober import HebrewProber -+from .charsetgroupprober import CharSetGroupProber -+from .sbcharsetprober import SingleByteCharSetProber -+from .langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model -+from .langgreekmodel import Latin7GreekModel, Win1253GreekModel -+from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel -+from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel -+from .langthaimodel import TIS620ThaiModel -+from .langhebrewmodel import Win1255HebrewModel -+from .hebrewprober import HebrewProber +-from charsetgroupprober import CharSetGroupProber +-from sbcharsetprober import SingleByteCharSetProber +-from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model +-from langgreekmodel import Latin7GreekModel, Win1253GreekModel +-from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel +-from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel +-from langthaimodel import TIS620ThaiModel +-from langhebrewmodel import Win1255HebrewModel +-from hebrewprober import HebrewProber ++from .charsetgroupprober import CharSetGroupProber ++from .sbcharsetprober import SingleByteCharSetProber ++from .langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model ++from .langgreekmodel import Latin7GreekModel, Win1253GreekModel ++from .langbulgarianmodel 
import Latin5BulgarianModel, Win1251BulgarianModel ++from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel ++from .langthaimodel import TIS620ThaiModel ++from .langhebrewmodel import Win1255HebrewModel ++from .hebrewprober import HebrewProber class SBCSGroupProber(CharSetGroupProber): def __init__(self): @@ -486,19 +489,19 @@ RefactoringTool: Skipping implicit fixer: ws_comma # 02110-1301 USA ######################### END LICENSE BLOCK ######################### --from mbcharsetprober import MultiByteCharSetProber --from codingstatemachine import CodingStateMachine --from chardistribution import SJISDistributionAnalysis --from jpcntx import SJISContextAnalysis --from mbcssm import SJISSMModel -+from .mbcharsetprober import MultiByteCharSetProber -+from .codingstatemachine import CodingStateMachine -+from .chardistribution import SJISDistributionAnalysis -+from .jpcntx import SJISContextAnalysis -+from .mbcssm import SJISSMModel +-from mbcharsetprober import MultiByteCharSetProber +-from codingstatemachine import CodingStateMachine +-from chardistribution import SJISDistributionAnalysis +-from jpcntx import SJISContextAnalysis +-from mbcssm import SJISSMModel ++from .mbcharsetprober import MultiByteCharSetProber ++from .codingstatemachine import CodingStateMachine ++from .chardistribution import SJISDistributionAnalysis ++from .jpcntx import SJISContextAnalysis ++from .mbcssm import SJISSMModel import constants, sys --from constants import eStart, eError, eItsMe -+from .constants import eStart, eError, eItsMe +-from constants import eStart, eError, eItsMe ++from .constants import eStart, eError, eItsMe class SJISProber(MultiByteCharSetProber): def __init__(self): @@ -508,14 +511,14 @@ RefactoringTool: Skipping implicit fixer: ws_comma ######################### END LICENSE BLOCK ######################### import constants, sys --from latin1prober import Latin1Prober # windows-1252 --from mbcsgroupprober import MBCSGroupProber # multi-byte 
character sets --from sbcsgroupprober import SBCSGroupProber # single-byte character sets --from escprober import EscCharSetProber # ISO-2122, etc. -+from .latin1prober import Latin1Prober # windows-1252 -+from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets -+from .sbcsgroupprober import SBCSGroupProber # single-byte character sets -+from .escprober import EscCharSetProber # ISO-2122, etc. +-from latin1prober import Latin1Prober # windows-1252 +-from mbcsgroupprober import MBCSGroupProber # multi-byte character sets +-from sbcsgroupprober import SBCSGroupProber # single-byte character sets +-from escprober import EscCharSetProber # ISO-2122, etc. ++from .latin1prober import Latin1Prober # windows-1252 ++from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets ++from .sbcsgroupprober import SBCSGroupProber # single-byte character sets ++from .escprober import EscCharSetProber # ISO-2122, etc. import re MINIMUM_THRESHOLD = 0.20 @@ -525,14 +528,14 @@ RefactoringTool: Skipping implicit fixer: ws_comma ######################### END LICENSE BLOCK ######################### import constants, sys --from constants import eStart, eError, eItsMe --from charsetprober import CharSetProber --from codingstatemachine import CodingStateMachine --from mbcssm import UTF8SMModel -+from .constants import eStart, eError, eItsMe -+from .charsetprober import CharSetProber -+from .codingstatemachine import CodingStateMachine -+from .mbcssm import UTF8SMModel +-from constants import eStart, eError, eItsMe +-from charsetprober import CharSetProber +-from codingstatemachine import CodingStateMachine +-from mbcssm import UTF8SMModel ++from .constants import eStart, eError, eItsMe ++from .charsetprober import CharSetProber ++from .codingstatemachine import CodingStateMachine ++from .mbcssm import UTF8SMModel ONE_CHAR_PROB = 0.5 @@ -579,8 +582,8 @@ RefactoringTool: Skipping implicit fixer: ws_comma count = 0 u = UniversalDetector() for f in 
glob.glob(sys.argv[1]): -- print f.ljust(60), -+ print(f.ljust(60), end=' ') +- print f.ljust(60), ++ print(f.ljust(60), end=' ') u.reset() for line in file(f, 'rb'): u.feed(line) @@ -588,14 +591,14 @@ RefactoringTool: Skipping implicit fixer: ws_comma u.close() result = u.result if result['encoding']: -- print result['encoding'], 'with confidence', result['confidence'] -+ print(result['encoding'], 'with confidence', result['confidence']) +- print result['encoding'], 'with confidence', result['confidence'] ++ print(result['encoding'], 'with confidence', result['confidence']) else: -- print '******** no result' -+ print('******** no result') +- print '******** no result' ++ print('******** no result') count += 1 --print count, 'tests' -+print(count, 'tests') +-print count, 'tests' ++print(count, 'tests') RefactoringTool: Files that were modified: RefactoringTool: test.py

    Well, that wasn’t so hard. Just a few imports and print statements to convert. Time to run the new version. Do you think it’ll work? @@ -648,7 +651,7 @@ import sys

    There are variations of this problem scattered throughout the chardet library. In some places it’s "import constants, sys"; in other places, it’s "import constants, re". The fix is the same: manually split the import statement into two lines, one for the relative import, the other for the absolute import.

    Onward!

    Name 'file' is not defined

    -

    FIXME intro +

    And here we go again, running test.py to try to execute our test cases…

    skip over this

    C:\home\chardet> python test.py tests\*\*
     tests\ascii\howto.diveintomark.org.xml
    @@ -661,7 +664,7 @@ NameError: name 'file' is not defined
    for line in open(f, 'rb'):

    And that’s all I have to say about that.

    Can’t use a string pattern on a bytes-like object

    -

    FIXME intro +

    Now things are starting to get interesting. And by “interesting,” I mean “confusing as all hell.”

    skip over this

    C:\home\chardet> python test.py tests\*\*
     tests\ascii\howto.diveintomark.org.xml
    @@ -671,8 +674,8 @@ NameError: name 'file' is not defined
    File "C:\home\chardet\chardet\universaldetector.py", line 98, in feed if self._highBitDetector.search(aBuf): TypeError: can't use a string pattern on a bytes-like object -

    Now things are starting to get interesting. And by “interesting,” I mean “confusing as all hell.” -

    First, let’s see what self._highBitDetector is. It’s defined in the __init__ method of the UniversalDetector class: +

    +

    To debug this, let’s see what self._highBitDetector is. It’s defined in the __init__ method of the UniversalDetector class:

    skip over this

    class UniversalDetector:
         def __init__(self):
    @@ -687,7 +690,7 @@ TypeError: can't use a string pattern on a bytes-like object
    . if self._mInputState == ePureAscii: if self._highBitDetector.search(aBuf): -

    And what is aBuf? Let’s backtrack further to a place that calls UniversalDetector.feed(). One place that calls it is the test harness, test.py. +

    And what is aBuf? Let’s backtrack further to a place that calls UniversalDetector.feed(). One place that calls it is the test harness, test.py.

    skip over this

    u = UniversalDetector()
     .
    @@ -695,7 +698,7 @@ TypeError: can't use a string pattern on a bytes-like object
    . for line in open(f, 'rb'): u.feed(line) -

    And here we find our answer: in the UniversalDetector.feed() method, aBuf is a line read from a file on disk. Look carefully at the parameters used to open the file: 'rb'. 'r' is for “read”; OK, big deal, we’re reading the file. Ah, but 'b' is for “binary.” Without the 'b' flag, this for loop would read the file, line by line, and convert each line into a string -- an array of Unicode characters -- according to the system default character encoding. (You could override the system encoding with another parameter to open(), but never mind that for now.) But with the 'b' flag, this for loop reads the file, line by line, and stores each line exactly as it appears in the file, as an array of bytes. That byte array gets passed to UniversalDetector.feed(), and eventually gets passed to the pre-compiled regular expression, self._highBitDetector, to search for high-bit... characters. But we don’t have characters; we have bytes. Oops. +

    And here we find our answer: in the UniversalDetector.feed() method, aBuf is a line read from a file on disk. Look carefully at the parameters used to open the file: 'rb'. 'r' is for “read”; OK, big deal, we’re reading the file. Ah, but 'b' is for “binary.” Without the 'b' flag, this for loop would read the file, line by line, and convert each line into a string -- an array of Unicode characters -- according to the system default character encoding. (You could override the system encoding with another parameter to open(), but never mind that for now.) But with the 'b' flag, this for loop reads the file, line by line, and stores each line exactly as it appears in the file, as an array of bytes. That byte array gets passed to UniversalDetector.feed(), and eventually gets passed to the pre-compiled regular expression, self._highBitDetector, to search for high-bit... characters. But we don’t have characters; we have bytes. Oops.

    What we need this regular expression to search is not an array of characters, but an array of bytes.

    Once you realize that, the solution is not difficult. Regular expressions defined with strings can search strings. Regular expressions defined with byte arrays can search byte arrays. To define a byte array pattern, we simply change the type of the argument we use to define the regular expression to a byte array. So instead of this:

    self._highBitDetector = re.compile(r'[\x80-\xFF]')
    @@ -716,7 +719,202 @@ for line in open(f, 'rb'): File "C:\home\chardet\chardet\universaldetector.py", line 100, in feed elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf): TypeError: Can't convert 'bytes' object to str implicitly -

    ... + +

    There's an unfortunate clash between coding style and the Python interpreter here. The TypeError could be anywhere on that line, but the traceback doesn't tell you exactly where it is. It could be in the first conditional or the second, and the traceback would look the same. To narrow it down, you should split the line in half, like this: + +

    skip over this code listing +

    elif (self._mInputState == ePureAscii) and \
    +    self._escDetector.search(self._mLastChar + aBuf):
    + +

    And re-run the test:

    + +

    skip over this command output listing +

    C:\home\chardet> python test.py tests\*\*
    +tests\ascii\howto.diveintomark.org.xml
    +Traceback (most recent call last):
    +  File "test.py", line 10, in <module>
    +    u.feed(line)
    +  File "C:\home\chardet\chardet\universaldetector.py", line 101, in feed
    +    self._escDetector.search(self._mLastChar + aBuf):
    +TypeError: Can't convert 'bytes' object to str implicitly
    + +

    Aha! The problem was not in the first conditional (self._mInputState == ePureAscii) but in the second one. So what could cause a TypeError there? Perhaps you're thinking that the search() method is expecting a value of a different type, but that wouldn't generate this traceback. Python functions can take any value; if you pass the right number of arguments, the function will execute. It may crash if you pass it a value of a different type than it's expecting, but if that happened, the traceback would point to somewhere inside the function. But this traceback says it never got as far as calling the search() method. So the problem must be in that + operation, as it's trying to construct the value that it will eventually pass to the search() method. + +

    We know from previous debugging that aBuf is a byte array. So what is self._mLastChar? It's an instance variable, defined in the reset() method, which is actually called from the __init__() method. + +

    skip over this code listing +

    class UniversalDetector:
    +    def __init__(self):
    +        self._highBitDetector = re.compile(b'[\x80-\xFF]')
    +        self._escDetector = re.compile(b'(\033|~{)')
    +        self._mEscCharSetProber = None
    +        self._mCharSetProbers = []
    +        self.reset()
    +
    +    def reset(self):
    +        self.result = {'encoding': None, 'confidence': 0.0}
    +        self.done = False
    +        self._mStart = True
    +        self._mGotData = False
    +        self._mInputState = ePureAscii
    +        self._mLastChar = ''
    + +

    And now we have our answer. Do you see it? self._mLastChar is a string, but aBuf is a byte array. And you can't concatenate a string to a byte array — not even a zero-length string. + +

    So what is self._mLastChar anyway? The answer is in the feed() method, just a few lines down from where the traceback occurred. + +

    skip over this code listing +

    if self._mInputState == ePureAscii:
    +    if self._highBitDetector.search(aBuf):
    +        self._mInputState = eHighbyte
    +    elif (self._mInputState == ePureAscii) and \
    +            self._escDetector.search(self._mLastChar + aBuf):
    +        self._mInputState = eEscAscii
    +
    +self._mLastChar = aBuf[-1]
    + +

    The calling function calls this feed() method over and over again with a few bytes at a time. The method processes the bytes it was given (passed in as aBuf), then stores the last byte in self._mLastChar in case it's needed during the next call. (In a multi-byte encoding, the feed() method might get called with half of a character, then called again with the other half.) But because aBuf is now a byte array instead of a string, self._mLastChar needs to be a byte array as well. Thus: + +

      def reset(self):
    +      .
    +      .
    +      .
    +-     self._mLastChar = ''
    ++     self._mLastChar = b''
    + +

    TypeError: unsupported operand type(s) for +: 'int' and 'bytes'

    + +

    I have good news, and I have bad news. The good news is we're making progress… + +

    skip over this command listing +

    C:\home\chardet> python test.py tests\*\*
    +tests\ascii\howto.diveintomark.org.xml
    +Traceback (most recent call last):
    +  File "test.py", line 10, in <module>
    +    u.feed(line)
    +  File "C:\home\chardet\chardet\universaldetector.py", line 101, in feed
    +    self._escDetector.search(self._mLastChar + aBuf):
    +TypeError: unsupported operand type(s) for +: 'int' and 'bytes'
    + +

    …The bad news is it doesn't always feel like progress. + +

    But this is progress! Really! Even though the traceback calls out the same line of code, it's a different error than it used to be. Progress! So what's the problem now? The last time I checked, this line of code didn't try to concatenate an int with a byte array (bytes). In fact, you just spent a lot of time ensuring that self._mLastChar was a byte array. How did it turn into an int? + +

    The answer lies not in the previous lines of code, but in the following lines. + +

    skip over this code listing +

    if self._mInputState == ePureAscii:
    +    if self._highBitDetector.search(aBuf):
    +        self._mInputState = eHighbyte
    +    elif (self._mInputState == ePureAscii) and \
    +            self._escDetector.search(self._mLastChar + aBuf):
    +        self._mInputState = eEscAscii
    +
    +self._mLastChar = aBuf[-1]
    + +

    This error doesn't occur the first time the feed() method gets called; it occurs the second time, after self._mLastChar has been set to the last byte of aBuf. Well, what's the problem with that? Getting a single element from a byte array yields an integer, not a byte array. To see the difference, follow me to the interactive shell: + +

    skip over this interpreter listing +

    +>>> aBuf = b'\xEF\xBB\xBF'         
    +>>> len(aBuf)
    +3
    +>>> mLastChar = aBuf[-1]
    +>>> mLastChar                      
    +191
    +>>> type(mLastChar)                
    +<class 'int'>
    +>>> mLastChar + aBuf               
    +Traceback (most recent call last):
    +  File "<stdin>", line 1, in <module>
    +TypeError: unsupported operand type(s) for +: 'int' and 'bytes'
    +>>> mLastChar = aBuf[-1:]          
    +>>> mLastChar
    +b'\xbf'
    +>>> mLastChar + aBuf               
    +b'\xbf\xef\xbb\xbf'
    +
      +
    1. Define a byte array of 3 bytes. +
    2. The last element of the byte array is 191. +
    3. That's an integer. +
    4. Concatenating an integer with a byte array doesn't work. You've now replicated the error you just found in universaldetector.py. +
    5. Ah, here's the fix. Instead of taking the last element of the byte array, use list slicing to create a new byte array containing just the last element. That is, start with the last element and continue the slice until the end of the byte array. Now mLastChar is a byte array of length 1. +
    6. Concatenating a byte array of length 1 with a byte array of length 3 returns a new byte array of length 4. +
    + +

    So, to ensure that the feed() method in universaldetector.py continues to work no matter how often it's called, you need to initialize self._mLastChar as a 0-length byte array, then make sure it stays a byte array. + +

                  self._escDetector.search(self._mLastChar + aBuf):
    +          self._mInputState = eEscAscii
    +
    +- self._mLastChar = aBuf[-1]
    ++ self._mLastChar = aBuf[-1:]
    + +

    TypeError: ord() expected string of length 1, but int found

    + +

    Tired yet? You're almost there… + +

    skip over this command output listing +

    C:\home\chardet> python test.py tests\*\*
    +tests\ascii\howto.diveintomark.org.xml                       ascii with confidence 1.0
    +tests\Big5\0804.blogspot.com.xml
    +Traceback (most recent call last):
    +  File "test.py", line 10, in <module>
    +    u.feed(line)
    +  File "C:\home\chardet\chardet\universaldetector.py", line 116, in feed
    +    if prober.feed(aBuf) == constants.eFoundIt:
    +  File "C:\home\chardet\chardet\charsetgroupprober.py", line 60, in feed
    +    st = prober.feed(aBuf)
    +  File "C:\home\chardet\chardet\utf8prober.py", line 53, in feed
    +    codingState = self._mCodingSM.next_state(c)
    +  File "C:\home\chardet\chardet\codingstatemachine.py", line 43, in next_state
    +    byteCls = self._mModel['classTable'][ord(c)]
    +TypeError: ord() expected string of length 1, but int found
    + +

    FIXME + +

    skip over this code listing +

    # codingstatemachine.py
    +def next_state(self, c):
    +    # for each byte we get its class
    +    # if it is first byte, we also get byte length
    +    byteCls = self._mModel['classTable'][ord(c)]
    + +

    FIXME [aBuf is a byte array, so c is an int, not a 1-character string. In other words, there's no need to call the ord() function, because c is already an int!] + +

    skip over this code listing +

    # utf8prober.py
    +def feed(self, aBuf):
    +    for c in aBuf:
    +        codingState = self._mCodingSM.next_state(c)
    + +

    FIXME [wrapup or deleteme] + +

    TypeError: unorderable types: int() >= str()

    + +

    FIXME [let's go again] + +

    skip over this command output listing +

    C:\home\chardet> python test.py tests\*\*
    +tests\ascii\howto.diveintomark.org.xml                       ascii with confidence 1.0
    +tests\Big5\0804.blogspot.com.xml
    +Traceback (most recent call last):
    +  File "test.py", line 10, in <module>
    +    u.feed(line)
    +  File "C:\home\chardet\chardet\universaldetector.py", line 116, in feed
    +    if prober.feed(aBuf) == constants.eFoundIt:
    +  File "C:\home\chardet\chardet\charsetgroupprober.py", line 60, in feed
    +    st = prober.feed(aBuf)
    +  File "C:\home\chardet\chardet\sjisprober.py", line 68, in feed
    +    self._mContextAnalyzer.feed(self._mLastChar[2 - charLen :], charLen)
    +  File "C:\home\chardet\chardet\jpcntx.py", line 145, in feed
    +    order, charLen = self.get_order(aBuf[i:i+2])
    +  File "C:\home\chardet\chardet\jpcntx.py", line 176, in get_order
    +    if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \
    +TypeError: unorderable types: int() >= str()
    + +

    FIXME +

    © 2001–4, 2009 ark Pilgrim, CC-BY-SA-3.0 diff --git a/chardet/chardet/__init__.pyc b/chardet/chardet/__init__.pyc index a991c54..d5024df 100644 Binary files a/chardet/chardet/__init__.pyc and b/chardet/chardet/__init__.pyc differ diff --git a/chardet/chardet/big5freq.pyc b/chardet/chardet/big5freq.pyc index 7552252..6f6a6af 100644 Binary files a/chardet/chardet/big5freq.pyc and b/chardet/chardet/big5freq.pyc differ diff --git a/chardet/chardet/big5prober.pyc b/chardet/chardet/big5prober.pyc index 522ab5d..5cd8f7d 100644 Binary files a/chardet/chardet/big5prober.pyc and b/chardet/chardet/big5prober.pyc differ diff --git a/chardet/chardet/chardistribution.pyc b/chardet/chardet/chardistribution.pyc index 4dc7697..c9ed078 100644 Binary files a/chardet/chardet/chardistribution.pyc and b/chardet/chardet/chardistribution.pyc differ diff --git a/chardet/chardet/charsetgroupprober.pyc b/chardet/chardet/charsetgroupprober.pyc index ed6b38f..9887ef5 100644 Binary files a/chardet/chardet/charsetgroupprober.pyc and b/chardet/chardet/charsetgroupprober.pyc differ diff --git a/chardet/chardet/charsetprober.pyc b/chardet/chardet/charsetprober.pyc index f1bd064..9550a50 100644 Binary files a/chardet/chardet/charsetprober.pyc and b/chardet/chardet/charsetprober.pyc differ diff --git a/chardet/chardet/codingstatemachine.pyc b/chardet/chardet/codingstatemachine.pyc index 9f44854..e02d8bb 100644 Binary files a/chardet/chardet/codingstatemachine.pyc and b/chardet/chardet/codingstatemachine.pyc differ diff --git a/chardet/chardet/constants.pyc b/chardet/chardet/constants.pyc index e7cd4fc..ae83f2c 100644 Binary files a/chardet/chardet/constants.pyc and b/chardet/chardet/constants.pyc differ diff --git a/chardet/chardet/escprober.pyc b/chardet/chardet/escprober.pyc index 5564ff1..fb025d0 100644 Binary files a/chardet/chardet/escprober.pyc and b/chardet/chardet/escprober.pyc differ diff --git a/chardet/chardet/escsm.pyc b/chardet/chardet/escsm.pyc index 1b97ea4..bc66e0b 100644 Binary 
files a/chardet/chardet/escsm.pyc and b/chardet/chardet/escsm.pyc differ diff --git a/chardet/chardet/eucjpprober.pyc b/chardet/chardet/eucjpprober.pyc index 3810fde..019edc8 100644 Binary files a/chardet/chardet/eucjpprober.pyc and b/chardet/chardet/eucjpprober.pyc differ diff --git a/chardet/chardet/euckrfreq.pyc b/chardet/chardet/euckrfreq.pyc index ba11667..2b7c2c4 100644 Binary files a/chardet/chardet/euckrfreq.pyc and b/chardet/chardet/euckrfreq.pyc differ diff --git a/chardet/chardet/euckrprober.pyc b/chardet/chardet/euckrprober.pyc index d9e33a5..ba8b4ac 100644 Binary files a/chardet/chardet/euckrprober.pyc and b/chardet/chardet/euckrprober.pyc differ diff --git a/chardet/chardet/euctwfreq.pyc b/chardet/chardet/euctwfreq.pyc index ee0826e..e586f56 100644 Binary files a/chardet/chardet/euctwfreq.pyc and b/chardet/chardet/euctwfreq.pyc differ diff --git a/chardet/chardet/euctwprober.pyc b/chardet/chardet/euctwprober.pyc index 2133083..41d576a 100644 Binary files a/chardet/chardet/euctwprober.pyc and b/chardet/chardet/euctwprober.pyc differ diff --git a/chardet/chardet/gb2312freq.pyc b/chardet/chardet/gb2312freq.pyc index c3e5c5a..4df57b1 100644 Binary files a/chardet/chardet/gb2312freq.pyc and b/chardet/chardet/gb2312freq.pyc differ diff --git a/chardet/chardet/gb2312prober.pyc b/chardet/chardet/gb2312prober.pyc index 89356b6..c200bb9 100644 Binary files a/chardet/chardet/gb2312prober.pyc and b/chardet/chardet/gb2312prober.pyc differ diff --git a/chardet/chardet/hebrewprober.pyc b/chardet/chardet/hebrewprober.pyc index 5546ae7..5788cf3 100644 Binary files a/chardet/chardet/hebrewprober.pyc and b/chardet/chardet/hebrewprober.pyc differ diff --git a/chardet/chardet/jisfreq.pyc b/chardet/chardet/jisfreq.pyc index 4259ec3..899285b 100644 Binary files a/chardet/chardet/jisfreq.pyc and b/chardet/chardet/jisfreq.pyc differ diff --git a/chardet/chardet/jpcntx.pyc b/chardet/chardet/jpcntx.pyc index 7adf578..455f6e8 100644 Binary files a/chardet/chardet/jpcntx.pyc and 
b/chardet/chardet/jpcntx.pyc differ diff --git a/chardet/chardet/langbulgarianmodel.pyc b/chardet/chardet/langbulgarianmodel.pyc index 5fc684c..65b8196 100644 Binary files a/chardet/chardet/langbulgarianmodel.pyc and b/chardet/chardet/langbulgarianmodel.pyc differ diff --git a/chardet/chardet/langcyrillicmodel.pyc b/chardet/chardet/langcyrillicmodel.pyc index 41d0a9f..ad465ee 100644 Binary files a/chardet/chardet/langcyrillicmodel.pyc and b/chardet/chardet/langcyrillicmodel.pyc differ diff --git a/chardet/chardet/langgreekmodel.pyc b/chardet/chardet/langgreekmodel.pyc index 55aa44b..0012470 100644 Binary files a/chardet/chardet/langgreekmodel.pyc and b/chardet/chardet/langgreekmodel.pyc differ diff --git a/chardet/chardet/langhebrewmodel.pyc b/chardet/chardet/langhebrewmodel.pyc index 0b9e814..369dc9e 100644 Binary files a/chardet/chardet/langhebrewmodel.pyc and b/chardet/chardet/langhebrewmodel.pyc differ diff --git a/chardet/chardet/langhungarianmodel.pyc b/chardet/chardet/langhungarianmodel.pyc index b2f02c1..cf25b16 100644 Binary files a/chardet/chardet/langhungarianmodel.pyc and b/chardet/chardet/langhungarianmodel.pyc differ diff --git a/chardet/chardet/langthaimodel.pyc b/chardet/chardet/langthaimodel.pyc index c29e8de..38b86d7 100644 Binary files a/chardet/chardet/langthaimodel.pyc and b/chardet/chardet/langthaimodel.pyc differ diff --git a/chardet/chardet/latin1prober.pyc b/chardet/chardet/latin1prober.pyc index 869b031..4b38eff 100644 Binary files a/chardet/chardet/latin1prober.pyc and b/chardet/chardet/latin1prober.pyc differ diff --git a/chardet/chardet/mbcharsetprober.pyc b/chardet/chardet/mbcharsetprober.pyc index 3b796fa..ba52fba 100644 Binary files a/chardet/chardet/mbcharsetprober.pyc and b/chardet/chardet/mbcharsetprober.pyc differ diff --git a/chardet/chardet/mbcsgroupprober.pyc b/chardet/chardet/mbcsgroupprober.pyc index 8eed604..5f03d2c 100644 Binary files a/chardet/chardet/mbcsgroupprober.pyc and b/chardet/chardet/mbcsgroupprober.pyc differ 
diff --git a/chardet/chardet/mbcssm.pyc b/chardet/chardet/mbcssm.pyc index 1db05cc..b04c2da 100644 Binary files a/chardet/chardet/mbcssm.pyc and b/chardet/chardet/mbcssm.pyc differ diff --git a/chardet/chardet/sbcharsetprober.pyc b/chardet/chardet/sbcharsetprober.pyc index c8b8672..b0bad59 100644 Binary files a/chardet/chardet/sbcharsetprober.pyc and b/chardet/chardet/sbcharsetprober.pyc differ diff --git a/chardet/chardet/sbcsgroupprober.pyc b/chardet/chardet/sbcsgroupprober.pyc index 857deca..6ec3e06 100644 Binary files a/chardet/chardet/sbcsgroupprober.pyc and b/chardet/chardet/sbcsgroupprober.pyc differ diff --git a/chardet/chardet/sjisprober.pyc b/chardet/chardet/sjisprober.pyc index 6a81164..f5b3a96 100644 Binary files a/chardet/chardet/sjisprober.pyc and b/chardet/chardet/sjisprober.pyc differ diff --git a/chardet/chardet/universaldetector.pyc b/chardet/chardet/universaldetector.pyc index 43c3a05..73669a3 100644 Binary files a/chardet/chardet/universaldetector.pyc and b/chardet/chardet/universaldetector.pyc differ diff --git a/chardet/chardet/utf8prober.pyc b/chardet/chardet/utf8prober.pyc index 9811837..9303ce9 100644 Binary files a/chardet/chardet/utf8prober.pyc and b/chardet/chardet/utf8prober.pyc differ diff --git a/chardet/python3-conversion-notes.txt b/chardet/python3-conversion-notes.txt index 3f7c7e7..5f74a32 100644 --- a/chardet/python3-conversion-notes.txt +++ b/chardet/python3-conversion-notes.txt @@ -10,11 +10,9 @@ import sys * test.py: change file() to open() * universaldetector.py: change r'' strings to b'' byte arrays in self._highBitDetector, self._escDetector regular expressions -- charsetprober.py: change regular expression-based replace to use b'' byte arrays instead of strings -- universaldetector.py: change self._mLastChar from a r'' string to a b'' byte array -- mbcharsetprober.py: change self._mLastChar from a list of two 1-character strings to a list of two ints -- universaldetector.py: getting a single element from a byte array 
yields an integer, not a byte, so change syntax to make sure we self._mLastChar is always a byte +* universaldetector.py: change self._mLastChar from a '' string to a b'' byte array +* universaldetector.py: getting a single element from a byte array yields an integer, not a byte, so change syntax to make sure we self._mLastChar is always a byte old: self._mLastChar = aBuf[-1] new: @@ -25,4 +23,8 @@ - jpcntx.py, chardistribution.py (editorial): global search-and-replace "aStr" --> "aBuf" to make it clear that we're passing around a byte array - sbcharsetprober.py, latin1prober.py: change ord(c) to c since it's already an int (iterating through a byte array) +- (not sure where this fits) mbcharsetprober.py: change self._mLastChar from a list of two 1-character strings to a list of two ints + +- (not sure where this fits) charsetprober.py: change regular expression-based replace to use b'' byte arrays instead of strings + - latin1prober.py: refactor reduce(operator.add, ...) to use a for loop instead diff --git a/dip2 b/dip2 index 3fa4ea4..4c15027 100644 --- a/dip2 +++ b/dip2 @@ -291,14 +291,14 @@

    The first thing you need to do with Python is install it. Or do you?

    If you're using an account on a hosted server, your ISP may have already installed Python. Most popular Linux distributions come with Python in the default installation. Mac OS X 10.2 and later includes a command-line version of Python, although you'll probably want to install a version that includes a more Mac-like graphical interface.

    Windows does not come with any version of Python, but don't despair! There are several ways to point-and-click your way to Python on Windows. -

    As you can see already, Python runs on a great many operating systems. The full list includes Windows, Mac OS, Mac OS X, and all varieties of free UNIX-compatible systems like Linux. There are also versions that run on Sun Solaris, AS/400, Amiga, OS/2, BeOS, and a plethora +

    As you can see already, Python runs on a great many operating systems. The full list includes Windows, Mac OS, Mac OS X, and all varieties of free UNIX-compatible systems like Linux. There are also versions that run on Sun Solaris, AS/400, Amiga, OS/2, BeOS, and a plethora of other platforms you've probably never even heard of.

    What's more, Python programs written on one platform can, with a little care, run on any supported platform. For instance, I regularly develop Python programs on Windows and later deploy them on Linux.

    So back to the question that started this section, “Which Python is right for you?” The answer is whichever one runs on the computer you already have.

    1.2. Python on Windows

    On Windows, you have a couple choices for installing Python. -

    ActiveState makes a Windows installer for Python called ActivePython, which includes a complete version of Python, an IDE with a Python-aware code editor, plus some Windows extensions for Python that allow complete access to Windows-specific services, APIs, and the Windows Registry. -

    ActivePython is freely downloadable, although it is not open source. It is the IDE I used to learn Python, and I recommend you try it unless you have a specific reason not to. One such reason might be that ActiveState is generally +

    ActiveState makes a Windows installer for Python called ActivePython, which includes a complete version of Python, an IDE with a Python-aware code editor, plus some Windows extensions for Python that allow complete access to Windows-specific services, APIs, and the Windows Registry. +

    ActivePython is freely downloadable, although it is not open source. It is the IDE I used to learn Python, and I recommend you try it unless you have a specific reason not to. One such reason might be that ActiveState is generally several months behind in updating their ActivePython installer when new versions of Python are released. If you absolutely need the latest version of Python and ActivePython is still a version behind as you read this, you'll want to use the second option for installing Python on Windows.

    The second option is the “official” Python installer, distributed by the people who develop Python itself. It is freely downloadable and open source, and it is always current with the latest version of Python.

    @@ -368,7 +368,7 @@ IDLE 1.0

    1.3. Python on Mac OS X

    On Mac OS X, you have two choices for installing Python: install it, or don't install it. You probably want to install it.

    Mac OS X 10.2 and later comes with a command-line version of Python preinstalled. If you are comfortable with the command line, you can use this version for the first third of the book. However, -the preinstalled version does not come with an XML parser, so when you get to the XML chapter, you'll need to install the full version. +the preinstalled version does not come with an XML parser, so when you get to the XML chapter, you'll need to install the full version.

    Rather than using the preinstalled version, you'll probably want to install the latest version, which also comes with a graphical interactive shell.

    @@ -426,7 +426,7 @@ Type "help", "copyright", "credits", or "license" for more information.

    Double-click PythonIDE to launch Python. -

    The MacPython IDE should display a splash screen, then take you to the interactive shell. If the interactive shell does not appear, select +

    The MacPython IDE should display a splash screen, then take you to the interactive shell. If the interactive shell does not appear, select Window->Python Interactive (Cmd-0). The opening window will look something like this:

     Python 2.3 (#2, Jul 30 2003, 11:45:28)
    @@ -475,7 +475,7 @@ Type "help", "copyright", "credits", or "license" for more information.
     

    Double-click Python IDE to launch Python. -

    The MacPython IDE should display a splash screen, and then take you to the interactive shell. If the interactive shell does not appear, select +

    The MacPython IDE should display a splash screen, and then take you to the interactive shell. If the interactive shell does not appear, select Window->Python Interactive (Cmd-0). You'll see a screen like this:

     Python 2.3 (#2, Jul 30 2003, 11:45:28)
    @@ -486,7 +486,7 @@ MacPython IDE 1.0.1
     

    1.5. Python on RedHat Linux

    Installing under UNIX-compatible operating systems such as Linux is easy if you're willing to install a binary package. Pre-built binary packages are available for most popular Linux distributions. Or you can always compile from source. -

    Download the latest Python RPM by going to http://www.python.org/ftp/python/ and selecting the highest version number listed, then selecting the rpms/ directory within that. Then download the RPM with the highest version number. You can install it with the rpm command, as shown here: +

    Download the latest Python RPM by going to http://www.python.org/ftp/python/ and selecting the highest version number listed, then selecting the rpms/ directory within that. Then download the RPM with the highest version number. You can install it with the rpm command, as shown here:

    Example 1.2. Installing on RedHat Linux 9

     localhost:~$ su -
     Password: [enter your root password]
    @@ -516,9 +516,9 @@ Type "help", "copyright", "credits", or "license" for more information.
     
  • Whoops! Just typing python gives you the older version of Python -- the one that was installed by default. That's not the one you want.
  • At the time of this writing, the newest version is called python2.3. You'll probably want to change the path on the first line of the sample scripts to point to the newer version.
  • This is the complete path of the newer version of Python that you just installed. Use this on the #! line (the first line of each script) to ensure that scripts are running under the latest version of Python, and be sure to type python2.3 to get into the interactive shell. -

    1.6. Python on Debian GNU/Linux

    -

    If you are lucky enough to be running Debian GNU/Linux, you install Python through the apt command. -

    Example 1.3. Installing on Debian GNU/Linux

    +

    1.6. Python on Debian GNU/Linux

    +

    If you are lucky enough to be running Debian GNU/Linux, you install Python through the apt command. +

    Example 1.3. Installing on Debian GNU/Linux

     localhost:~$ su -
     Password: [enter your root password]
     localhost:~# apt-get install python
    @@ -640,16 +640,16 @@ if __name__ == "__main__":
         print buildConnectionString(myParams)

    Now run this program and see what happens. -
    TipIn the ActivePython IDE on Windows, you can run the Python program you're editing by choosing +TipIn the ActivePython IDE on Windows, you can run the Python program you're editing by choosing File->Run... (Ctrl-R). Output is displayed in the interactive window. -
    TipIn the Python IDE on Mac OS, you can run a Python program with -Python->Run window... (Cmd-R), but there is an important option you must set first. Open the .py file in the IDE, pop up the options menu by clicking the black triangle in the upper-right corner of the window, and make sure the Run as __main__ option is checked. This is a per-file setting, but you'll only need to do it once per file. +TipIn the Python IDE on Mac OS, you can run a Python program with +Python->Run window... (Cmd-R), but there is an important option you must set first. Open the .py file in the IDE, pop up the options menu by clicking the black triangle in the upper-right corner of the window, and make sure the Run as __main__ option is checked. This is a per-file setting, but you'll only need to do it once per file. -
    TipOn UNIX-compatible systems (including Mac OS X), you can run a Python program from the command line: python odbchelper.py

    The output of odbchelper.py will look like this:

    server=mpilgrim;uid=sa;database=master;pwd=secret

    2.2. Declaring Functions

    -

    Python has functions like most other languages, but it does not have separate header files like C++ or interface/implementation sections like Pascal. When you need a function, just declare it, like this: +

    TipOn UNIX-compatible systems (including Mac OS X), you can run a Python program from the command line: python odbchelper.py

    The output of odbchelper.py will look like this:

    server=mpilgrim;uid=sa;database=master;pwd=secret

    2.2. Declaring Functions

    +

    Python has functions like most other languages, but it does not have separate header files like C++ or interface/implementation sections like Pascal. When you need a function, just declare it, like this:

    
     def buildConnectionString(params):

    Note that the keyword def starts the function declaration, followed by the function name, followed by the arguments in parentheses. Multiple arguments (not shown here) are separated with commas. @@ -661,7 +661,7 @@ In fact, every Python function returns a value; if the function ever executes a

    The argument, params, doesn't specify a datatype. In Python, variables are never explicitly typed. Python figures out what type a variable is and keeps track of it internally. -
    NoteIn Java, C++, and other statically-typed languages, you must specify the datatype of the function return value and each function argument. +NoteIn Java, C++, and other statically-typed languages, you must specify the datatype of the function return value and each function argument. In Python, you never explicitly specify the datatype of anything. Based on what value you assign, Python keeps track of the datatype internally.

    2.2.1. How Python's Datatypes Compare to Other Programming Languages

    An erudite reader sent me this explanation of how Python compares to other programming languages: @@ -669,7 +669,7 @@ In fact, every Python function returns a value; if the function ever executes a

    statically typed language
    A language in which types are fixed at compile time. Most statically typed languages enforce this by requiring you to declare - all variables with their datatypes before using them. Java and C are statically typed languages. + all variables with their datatypes before using them. Java and C are statically typed languages.
    dynamically typed language
    A language in which types are discovered at execution time; the opposite of statically typed. VBScript and Python are dynamically typed, because they figure out what type a variable is when you first assign it a value. @@ -698,7 +698,7 @@ def buildConnectionString(params): need to give your function a docstring, but you always should. I know you've heard this in every programming class you've ever taken, but Python gives you an added incentive: the docstring is available at runtime as an attribute of the function. -
    NoteMany Python IDEs use the docstring to provide context-sensitive documentation, so that when you type a function name, its docstring appears as a tooltip. This can be incredibly helpful, but it's only as good as the docstrings you write. +NoteMany Python IDEs use the docstring to provide context-sensitive documentation, so that when you type a function name, its docstring appears as a tooltip. This can be incredibly helpful, but it's only as good as the docstrings you write. @@ -712,7 +712,7 @@ need to give your function a docstring, but you always should. I kn if __name__ == "__main__":

    Some quick observations before you get to the good stuff. First, parentheses are not required around the if expression. Second, the if statement ends with a colon, and is followed by indented code. -
    NoteLike C, Python uses == for comparison and = for assignment. Unlike C, Python does not support in-line assignment, so there's no chance of accidentally assigning the value you thought you were comparing. +NoteLike C, Python uses == for comparison and = for assignment. Unlike C, Python does not support in-line assignment, so there's no chance of accidentally assigning the value you thought you were comparing.

    So why is this particular if statement a trick? Modules are objects, and all modules have a built-in attribute __name__. A module's __name__ depends on how you're using the module. If you import the module, then __name__ is the module's filename, without a directory path or file extension. But you can also run the module directly as a standalone program, in which case __name__ will be a special default value, __main__.

    >>> import odbchelper
    @@ -746,7 +746,7 @@ if __name__ == "__main__":
     

    Also notice that the variable assignment is one command split over several lines, with a backslash (“\”) serving as a line-continuation marker. -
    NoteWhen a command is split among several lines with the line-continuation marker (“\”), the continued lines can be indented in any manner; Python's normally stringent indentation rules do not apply. If your Python IDE auto-indents the continued line, you should probably accept its default unless you have a burning reason not to. +NoteWhen a command is split among several lines with the line-continuation marker (“\”), the continued lines can be indented in any manner; Python's normally stringent indentation rules do not apply. If your Python IDE auto-indents the continued line, you should probably accept its default unless you have a burning reason not to.

    Strictly speaking, expressions in parentheses, straight brackets, or curly braces (like defining a dictionary) can be split into multiple lines with or without the line continuation character (“\”). I like to include the backslash even when it's not required because I think it makes the code easier to read, but that's a matter of style.

    Third, you never declared the variable myParams, you just assigned a value to it. This is like VBScript without the option explicit option. Luckily, unlike VBScript, Python will not allow you to reference a variable that has never been assigned a value; trying to do so will raise an exception. @@ -770,7 +770,7 @@ NameError: There is no variable named 'x' 'e'

    1. v is a tuple of three elements, and (x, y, z) is a tuple of three variables. Assigning one to the other assigns each of the values of v to each of the variables, in order. -

      This has all sorts of uses. I often want to assign names to a range of values. In C, you would use enum and manually list each constant and its associated value, which seems especially tedious when the values are consecutive. +

      This has all sorts of uses. I often want to assign names to a range of values. In C, you would use enum and manually list each constant and its associated value, which seems especially tedious when the values are consecutive. In Python, you can use the built-in range function with multi-variable assignment to quickly assign consecutive values.

      Example 3.20. Assigning Consecutive Values

      >>> range(7)              
       [0, 1, 2, 3, 4, 5, 6]
      @@ -784,7 +784,7 @@ NameError: There is no variable named 'x'
       
      1. The built-in range function returns a list of integers. In its simplest form, it takes an upper limit and returns a zero-based list counting up to but not including the upper limit. (If you like, you can pass other parameters to specify a base other than 0 and a step other than 1. You can print range.__doc__ for details.) -
      2. MONDAY, TUESDAY, WEDNESDAY, THURSDAY, FRIDAY, SATURDAY, and SUNDAY are the variables you're defining. (This example came from the calendar module, a fun little module that prints calendars, like the UNIX program cal. The calendar module defines integer constants for days of the week.) +
      3. MONDAY, TUESDAY, WEDNESDAY, THURSDAY, FRIDAY, SATURDAY, and SUNDAY are the variables you're defining. (This example came from the calendar module, a fun little module that prints calendars, like the UNIX program cal. The calendar module defines integer constants for days of the week.)
      4. Now each variable has its value: MONDAY is 0, TUESDAY is 1, and so forth.

        You can also use multi-variable assignment to build functions that return multiple values, simply by returning a tuple of all the values. The caller can treat it as a tuple, or assign the values to individual variables. Many standard Python libraries do this, including the os module, which you'll discuss in Chapter 6. @@ -801,7 +801,7 @@ NameError: There is no variable named 'x' to insert values into a string with the %s placeholder. -
        NoteString formatting in Python uses the same syntax as the sprintf function in C. +NoteString formatting in Python uses the same syntax as the sprintf function in C.

        Example 3.21. Introducing String Formatting

        >>> k = "uid"
         >>> v = "sa"
         >>> "%s=%s" % (k, v) 
        @@ -833,7 +833,7 @@ TypeError: cannot concatenate 'str' and 'int' objects
        String formatting works with integers by specifying %d instead of %s.
      5. Trying to concatenate a string with a non-string raises an exception. Unlike string formatting, string concatenation works only when everything is already a string. -

        As with printf in C, string formatting in Python is like a Swiss Army knife. There are options galore, and modifier strings to specially format many different types of values. +

        As with printf in C, string formatting in Python is like a Swiss Army knife. There are options galore, and modifier strings to specially format many different types of values.

        Example 3.23. Formatting Numbers

         >>> print "Today's stock price: %f" % 50.4625   
         50.462500
        @@ -852,7 +852,7 @@ TypeError: cannot concatenate 'str' and 'int' objects
      6. Python Library Reference summarizes all the string formatting format characters. -
      7. Effective AWK Programming discusses all the format characters and advanced string formatting techniques like specifying width, precision, and zero-padding. +
      8. Effective AWK Programming discusses all the format characters and advanced string formatting techniques like specifying width, precision, and zero-padding.

        3.6. Mapping Lists

        @@ -953,7 +953,7 @@ called split.
      9. Python Library Reference documents the string module. -
      10. The Whole Python FAQ explains why join is a string method instead of a list method. +
      11. The Whole Python FAQ explains why join is a string method instead of a list method.

        3.7.1. Historical Note on String Methods

        @@ -981,9 +981,9 @@ if __name__ == "__main__":

        Before diving into the next chapter, make sure you're comfortable doing all of these things:

          -
        • Using the Python IDE to test expressions interactively +
        • Using the Python IDE to test expressions interactively -
        • Writing Python programs and running them from within your IDE, or from the command line +
        • Writing Python programs and running them from within your IDE, or from the command line
        • Importing modules and calling their functions @@ -1029,7 +1029,7 @@ if __name__ == "__main__": The if __name__ trick allows this program to do something useful when run by itself, without interfering with its use as a module for other programs. In this case, the program simply prints out the docstring of the info function.
        • if statements use == for comparison, and parentheses are not required. -

          The info function is designed to be used by you, the programmer, while working in the Python IDE. It takes any object that has functions or methods (like a module, which has functions, or a list, which has methods) and +

          The info function is designed to be used by you, the programmer, while working in the Python IDE. It takes any object that has functions or methods (like a module, which has functions, or a list, which has methods) and prints out the functions and their docstrings.

          Example 4.2. Sample Usage of apihelper.py

          >>> from apihelper import info
           >>> li = []
          @@ -1054,7 +1054,7 @@ buildConnectionString          Build a connection string from a dictionary Retur
               Returns string.
           

          4.2. Using Optional and Named Arguments

          Python allows function arguments to have default values; if the function is called without the argument, the argument gets its default - value. Furthermore, arguments can be specified in any order by using named arguments. Stored procedures in SQL Server Transact/SQL can do this, so if you're a SQL Server scripting guru, you can skim this part. + value. Furthermore, arguments can be specified in any order by using named arguments. Stored procedures in SQL Server Transact/SQL can do this, so if you're a SQL Server scripting guru, you can skim this part.

          Here is an example of info, a function with two optional arguments:

          
           def info(object, spacing=10, collapse=1):

          spacing and collapse are optional, because they have default values defined. object is required, because it has no default value. If info is called with only one argument, spacing defaults to 10 and collapse defaults to 1. If info is called with two arguments, collapse still defaults to 1. @@ -1247,7 +1247,7 @@ True

          4.4.2. getattr As a Dispatcher

          A common usage pattern of getattr is as a dispatcher. For example, if you had a program that could output data in a variety of different formats, you could define separate functions for each output format and use a single dispatch function to call the right one. -

          For example, let's imagine a program that prints site statistics in HTML, XML, and plain text formats. The choice of output format could be specified on the command line, or stored in a configuration +

          For example, let's imagine a program that prints site statistics in HTML, XML, and plain text formats. The choice of output format could be specified on the command line, or stored in a configuration file. A statsout module defines three functions, output_html, output_xml, and output_text. Then the main program defines a single output function, like this:

          Example 4.12. Creating a Dispatcher with getattr

          
           import statsout
          @@ -1345,7 +1345,7 @@ the pop method of a list) and user-defined (like the buildCon
           
        • If all values are false, or returns the last value. or evaluates '', which is false, then [], which is false, then {}, which is false, and returns {}.
        • Note that or evaluates values only until it finds one that is true in a boolean context, and then it ignores the rest. This distinction is important if some values can have side effects. Here, the function sidefx is never called, because or evaluates 'a', which is true, and returns 'a' immediately. -

          If you're a C hacker, you are certainly familiar with the bool ? a : b expression, which evaluates to a if bool is true, and b otherwise. Because of the way and and or work in Python, you can accomplish the same thing. +

          If you're a C hacker, you are certainly familiar with the bool ? a : b expression, which evaluates to a if bool is true, and b otherwise. Because of the way and and or work in Python, you can accomplish the same thing.

          4.6.1. Using the and-or Trick

          Example 4.17. Introducing the and-or Trick

          >>> a = "first"
           >>> b = "second"
          @@ -1355,17 +1355,17 @@ the pop method of a list) and user-defined (like the buildCon
           'second'
           
            -
          1. This syntax looks similar to the bool ? a : b expression in C. The entire expression is evaluated from left to right, so the and is evaluated first. 1 and 'first' evaluates to 'first', then 'first' or 'second' evaluates to 'first'. +
          2. This syntax looks similar to the bool ? a : b expression in C. The entire expression is evaluated from left to right, so the and is evaluated first. 1 and 'first' evaluates to 'first', then 'first' or 'second' evaluates to 'first'.
          3. 0 and 'first' evaluates to 0, and then 0 or 'second' evaluates to 'second'.

            However, since this Python expression is simply boolean logic, and not a special construct of the language, there is one extremely important difference - between this and-or trick in Python and the bool ? a : b syntax in C. If the value of a is false, the expression will not work as you would expect it to. (Can you tell I was bitten by this? More than once?) + between this and-or trick in Python and the bool ? a : b syntax in C. If the value of a is false, the expression will not work as you would expect it to. (Can you tell I was bitten by this? More than once?)

            Example 4.18. When the and-or Trick Fails

            >>> a = ""
             >>> b = "second"
             >>> 1 and a or b         
             'second'
            1. Since a is an empty string, which Python considers false in a boolean context, 1 and '' evaluates to '', and then '' or 'second' evaluates to 'second'. Oops! That's not what you wanted. -

              The and-or trick, bool and a or b, will not work like the C expression bool ? a : b when a is false in a boolean context. +

              The and-or trick, bool and a or b, will not work like the C expression bool ? a : b when a is false in a boolean context.

              The real trick behind the and-or trick, then, is to make sure that the value of a is never false. One common way of doing this is to turn a into [a] and b into [b], then taking the first element of the returned list, which will be either a or b.

              Example 4.19. Using the and-or Trick Safely

              >>> a = ""
               >>> b = "second"
              @@ -1436,9 +1436,9 @@ a	test
               
               

              4.8. Putting It All Together

              @@ -1479,12 +1479,12 @@ True 'None'
                -
              1. You can easily define a function that has no docstring, so its __doc__ attribute is None. Confusingly, if you evaluate the __doc__ attribute directly, the Python IDE prints nothing at all, which makes sense if you think about it, but is still unhelpful. +
              2. You can easily define a function that has no docstring, so its __doc__ attribute is None. Confusingly, if you evaluate the __doc__ attribute directly, the Python IDE prints nothing at all, which makes sense if you think about it, but is still unhelpful.
              3. You can verify that the value of the __doc__ attribute is actually None by comparing it directly.
              4. The str function takes the null value and returns a string representation of it, 'None'. -
                NoteIn SQL, you must use IS NULL instead of = NULL to compare a null value. In Python, you can use either == None or is None, but is None is faster. +NoteIn SQL, you must use IS NULL instead of = NULL to compare a null value. In Python, you can use either == None or is None, but is None is faster.

                Now that you are guaranteed to have a string, you can pass the string to processFunc, which you have already defined as a function that either does or doesn't collapse whitespace. Now you see why it was important to use str to convert a None value into a string representation. processFunc is assuming a string argument and calling its split method, which would crash if you passed it None because None doesn't have a split method.

                Stepping back even further, you see that you're using string formatting again to concatenate the return value of processFunc with the return value of method's ljust method. This is a new string method that you haven't seen before.

                Example 4.24. Introducing ljust

                >>> s = 'buildConnectionString'
                @@ -1703,7 +1703,7 @@ can import individual items or use from module import *
                 
                Notefrom module import * in Python is like import module.* in Java; import module in Python is like import module in Java. -

                Example 5.2. import module vs. from module import

                >>> import types
                +

                Example 5.2. import module vs. from module import

                >>> import types
                 >>> types.FunctionType             
                 <type 'function'>
                 >>> FunctionType 
                @@ -1736,7 +1736,7 @@ NameError: There is no variable named 'FunctionType'
                 

                Further Reading on Module Importing Techniques

                  -
                • eff-bot has more to say on import module vs. from module import. +
                • eff-bot has more to say on import module vs. from module import.
                • Python Tutorial discusses advanced import techniques, including from module import *. @@ -1756,10 +1756,10 @@ class Loaf:
                • You probably guessed this, but everything in a class is indented, just like the code within a function, if statement, for loop, and so forth. The first thing not indented is not in the class. -
                  NoteThe pass statement in Python is like an empty set of braces ({}) in Java or C. +NoteThe pass statement in Python is like an empty set of braces ({}) in Java or C.

                  Of course, realistically, most classes will be inherited from other classes, and they will define their own class methods and attributes. But as you've just seen, there is nothing that a class absolutely must have, other than a name. In particular, -C++ programmers may find it odd that Python classes don't have explicit constructors and destructors. Python classes do have something similar to a constructor: the __init__ method. +C++ programmers may find it odd that Python classes don't have explicit constructors and destructors. Python classes do have something similar to a constructor: the __init__ method.

                  Example 5.4. Defining the FileInfo Class

                  
                   from UserDict import UserDict
                   
                  @@ -1791,7 +1791,7 @@ class FileInfo(UserDict):
                                  them optional to the caller. In this case, filename has a default value of None, which is the Python null value.
                   
                   
                  -
                  NoteBy convention, the first argument of any Python class method (the reference to the current instance) is called self. This argument fills the role of the reserved word this in C++ or Java, but self is not a reserved word in Python, merely a naming convention. Nonetheless, please don't call it anything but self; this is a very strong convention. +NoteBy convention, the first argument of any Python class method (the reference to the current instance) is called self. This argument fills the role of the reserved word this in C++ or Java, but self is not a reserved word in Python, merely a naming convention. Nonetheless, please don't call it anything but self; this is a very strong convention.

                  Example 5.6. Coding the FileInfo Class

                  
                   class FileInfo(UserDict):
                       "store file metadata"
                  @@ -1844,7 +1844,7 @@ class FileInfo(UserDict):
                   
                • Remember when the __init__ method assigned its filename argument to self["name"]? Well, here's the result. The arguments you pass when you create the class instance get sent right along to the __init__ method (along with the object reference, self, which Python adds for free). -
                  NoteIn Python, simply call a class as if it were a function to create a new instance of the class. There is no explicit new operator like C++ or Java. +NoteIn Python, simply call a class as if it were a function to create a new instance of the class. There is no explicit new operator like C++ or Java.

                  5.4.1. Garbage Collection

                  If creating new instances is easy, destroying them is even easier. In general, there is no need to explicitly free instances, because they are freed automatically when the variables assigned to them go out of scope. Memory leaks are rare in Python. @@ -1874,7 +1874,7 @@ class FileInfo(UserDict):

                  As you've seen, FileInfo is a class that acts like a dictionary. To explore this further, let's look at the UserDict class in the UserDict module, which is the ancestor of the FileInfo class. This is nothing special; the class is written in Python and stored in a .py file, just like any other Python code. In particular, it's stored in the lib directory in your Python installation. -
                  TipIn the ActivePython IDE on Windows, you can quickly open any module in your library path by selecting +TipIn the ActivePython IDE on Windows, you can quickly open any module in your library path by selecting File->Locate... (Ctrl-L).

                  Example 5.9. Defining the UserDict Class

                  
                   class UserDict:              
                  @@ -1887,24 +1887,24 @@ class UserDict:              
                   
                • This is the __init__ method that you overrode in the FileInfo class. Note that the argument list in this ancestor class is different than the descendant. That's okay; each subclass can have its own set of arguments, as long as it calls the ancestor with the correct arguments. Here the ancestor class has a way to define initial values (by passing a dictionary in the dict argument) which the FileInfo does not use. -
                • Python supports data attributes (called “instance variables” in Java and Powerbuilder, and “member variables” in C++). Data attributes are pieces of data held by a specific instance of a class. In this case, each instance of UserDict will have a data attribute data. To reference this attribute from code outside the class, you qualify it with the instance name, instance.data, in the same way that you qualify a function with its module name. To reference a data attribute from within the class, +
                • Python supports data attributes (called “instance variables” in Java and Powerbuilder, and “member variables” in C++). Data attributes are pieces of data held by a specific instance of a class. In this case, each instance of UserDict will have a data attribute data. To reference this attribute from code outside the class, you qualify it with the instance name, instance.data, in the same way that you qualify a function with its module name. To reference a data attribute from within the class, you use self as the qualifier. By convention, all data attributes are initialized to reasonable values in the __init__ method. However, this is not required, since data attributes, like local variables, spring into existence when they are first assigned a value.
                • The update method is a dictionary duplicator: it copies all the keys and values from one dictionary to another. This does not clear the target dictionary first; if the target dictionary already has some keys, the ones from the source dictionary will be overwritten, but others will be left untouched. Think of update as a merge function, not a copy function.
                • This is a syntax you may not have seen before (I haven't used it in the examples in this book). It's an if statement, but instead of having an indented block starting on the next line, there is just a single statement on the same line, after the colon. This is perfectly legal syntax, which is just a shortcut you can use when you have only one statement - in a block. (It's like specifying a single statement without braces in C++.) You can use this syntax, or you can have indented code on subsequent lines, but you can't do both for the same block. + in a block. (It's like specifying a single statement without braces in C++.) You can use this syntax, or you can have indented code on subsequent lines, but you can't do both for the same block. -
                  NoteJava and Powerbuilder support function overloading by argument list, i.e. one class can have multiple methods with the same name but a different number of arguments, or arguments of different types. - Other languages (most notably PL/SQL) even support function overloading by argument name; i.e. one class can have multiple methods with the same name and the same number of arguments of the same type but different argument +NoteJava and Powerbuilder support function overloading by argument list, i.e. one class can have multiple methods with the same name but a different number of arguments, or arguments of different types. + Other languages (most notably PL/SQL) even support function overloading by argument name; i.e. one class can have multiple methods with the same name and the same number of arguments of the same type but different argument names. Python supports neither of these; it has no form of function overloading whatsoever. Methods are defined solely by their name, and there can be only one method per class with a given name. So if a descendant class has an __init__ method, it always overrides the ancestor __init__ method, even if the descendant defines it with a different argument list. And the same rule applies to any other method.
                  NoteGuido, the original author of Python, explains method overriding this way: "Derived classes may override methods of their base classes. Because methods have no special privileges when calling other methods of the same object, a method of a base class that calls another method defined - in the same base class, may in fact end up calling a method of a derived class that overrides it. (For C++ programmers: all methods in Python are effectively virtual.)" If that doesn't make sense to you (it confuses the hell out of me), feel free to ignore it. + in the same base class, may in fact end up calling a method of a derived class that overrides it. (For C++ programmers: all methods in Python are effectively virtual.)" If that doesn't make sense to you (it confuses the hell out of me), feel free to ignore it. I just thought I'd pass it along. @@ -2190,7 +2190,7 @@ AttributeError: 'MP3FileInfo' instance has no attribute '__parse'Like many other programming languages, Python has exception handling via try...except blocks.
                  -
                  NotePython uses try...except to handle exceptions and raise to generate them. Java and C++ use try...catch to handle exceptions, and throw to generate them. +NotePython uses try...except to handle exceptions and raise to generate them. Java and C++ use try...catch to handle exceptions, and throw to generate them.

                  Exceptions are everywhere in Python. Virtually every module in the standard Python library uses them, and Python itself will raise them in a lot of different circumstances. You've already seen them repeatedly throughout this book.

                  -

                  In each of these cases, you were simply playing around in the Python IDE: an error occurred, the exception was printed (depending on your IDE, perhaps in an intentionally jarring shade of red), and that was that. This is called an unhandled exception. When the exception was raised, there was no code to explicitly notice it and deal with it, so it bubbled its -way back to the default behavior built in to Python, which is to spit out some debugging information and give up. In the IDE, that's no big deal, but if that happened while your actual Python program was running, the entire program would come to a screeching halt. +

                  In each of these cases, you were simply playing around in the Python IDE: an error occurred, the exception was printed (depending on your IDE, perhaps in an intentionally jarring shade of red), and that was that. This is called an unhandled exception. When the exception was raised, there was no code to explicitly notice it and deal with it, so it bubbled its +way back to the default behavior built in to Python, which is to spit out some debugging information and give up. In the IDE, that's no big deal, but if that happened while your actual Python program was running, the entire program would come to a screeching halt.

                  An exception doesn't need to result in a complete program crash, though. Exceptions, when raised, can be handled. Sometimes an exception is really because you have a bug in your code (like accessing a variable that doesn't exist), but many times, an exception is something you can anticipate. If you're opening a file, it might not exist. If you're connecting to a database, it might be unavailable, or you might not have the correct security credentials to access it. If you know @@ -2239,7 +2239,7 @@ exceptions, errors occur immediately, and you can handle them in a standard way or to support multiple platforms (where platform-specific code is separated into different modules).

                  You can also define your own exceptions by creating a class that inherits from the built-in Exception class, and then raise your exceptions with the raise command. See the further reading section if you're interested in doing this.

                  The next example demonstrates how to use an exception to support platform-specific functionality. This code comes from the -getpass module, a wrapper module for getting a password from the user. Getting a password is accomplished differently on UNIX, Windows, and Mac OS platforms, but this code encapsulates all of those differences. +getpass module, a wrapper module for getting a password from the user. Getting a password is accomplished differently on UNIX, Windows, and Mac OS platforms, but this code encapsulates all of those differences.

                  Example 6.2. Supporting Platform-Specific Functionality

                  
                     # Bind the name getpass to the appropriate function
                     try:
                  @@ -2259,9 +2259,9 @@ exceptions, errors occur immediately, and you can handle them in a standard way
                     else:
                         getpass = unix_getpass
                    -
                  1. termios is a UNIX-specific module that provides low-level control over the input terminal. If this module is not available (because it's not +
                  2. termios is a UNIX-specific module that provides low-level control over the input terminal. If this module is not available (because it's not on your system, or your system doesn't support it), the import fails and Python raises an ImportError, which you catch. -
                  3. OK, you didn't have termios, so let's try msvcrt, which is a Windows-specific module that provides an API to many useful functions in the Microsoft Visual C++ runtime services. If this import fails, Python will raise an ImportError, which you catch. +
                  4. OK, you didn't have termios, so let's try msvcrt, which is a Windows-specific module that provides an API to many useful functions in the Microsoft Visual C++ runtime services. If this import fails, Python will raise an ImportError, which you catch.
                  5. If the first two didn't work, you try to import a function from EasyDialogs, which is a Mac OS-specific module that provides functions to pop up dialog boxes of various types. Once again, if this import fails, Python will raise an ImportError, which you catch.
                  6. None of these platform-specific modules is available (which is possible, since Python has been ported to a lot of different platforms), so you need to fall back on a default password input function (which is defined elsewhere in the getpass module). Notice what you're doing here: assigning the function default_getpass to the variable getpass. If you read the official getpass documentation, it tells you that the getpass module defines a getpass function. It does this by binding getpass to the correct function for your platform. Then when you call the getpass function, you're really calling a platform-specific function that this code has set up for you. You don't need to know or @@ -2358,7 +2358,7 @@ ValueError: I/O operation on closed file
                  7. Just because a file is closed doesn't mean that the file object ceases to exist. The variable f will continue to exist until it goes out of scope or gets manually deleted. However, none of the methods that manipulate an open file will work once the file has been closed; they all raise an exception.
                  8. Calling close on a file object whose file is already closed does not raise an exception; it fails silently. -

                    6.2.3. Handling I/O Errors

                    +

                    6.2.3. Handling I/O Errors

                    Now you've seen enough to understand the file handling code in the fileinfo.py sample code from the previous chapter. This example shows how to safely open and read from a file and gracefully handle errors.

                    Example 6.6. File Objects in MP3FileInfo

                    
                    @@ -2486,7 +2486,7 @@ USERNAME=mpilgrim
                     [...snip...]
                    1. os.environ is a dictionary of the environment variables defined on your system. In Windows, these are your user and system variables - accessible from MS-DOS. In UNIX, they are the variables exported in your shell's startup scripts. In Mac OS, there is no concept of environment variables, so this dictionary is empty. + accessible from MS-DOS. In UNIX, they are the variables exported in your shell's startup scripts. In Mac OS, there is no concept of environment variables, so this dictionary is empty.
                    2. os.environ.items() returns a list of tuples: [(key1, value1), (key2, value2), ...]. The for loop iterates through this list. The first round, it assigns key1 to k and value1 to v, so k = USERPROFILE and v = C:\Documents and Settings\mpilgrim. In the second round, k gets the second key, OS, and v gets the corresponding value, Windows_NT.
                    3. With multi-variable assignment and list comprehensions, you can replace the entire for loop with a single statement. Whether you actually do this in real code is a matter of personal coding style. I like it because it makes it clear that what I'm doing is mapping a dictionary into a list, then joining the list into a single string. @@ -2530,7 +2530,7 @@ UserDict stat
                      1. The sys module contains system-level information, such as the version of Python you're running (sys.version or sys.version_info), and system-level options such as the maximum allowed recursion depth (sys.getrecursionlimit() and sys.setrecursionlimit()). -
                      2. sys.modules is a dictionary containing all the modules that have ever been imported since Python was started; the key is the module name, the value is the module object. Note that this is more than just the modules your program has imported. Python preloads some modules on startup, and if you're using a Python IDE, sys.modules contains all the modules imported by all the programs you've run within the IDE. +
                      3. sys.modules is a dictionary containing all the modules that have ever been imported since Python was started; the key is the module name, the value is the module object. Note that this is more than just the modules your program has imported. Python preloads some modules on startup, and if you're using a Python IDE, sys.modules contains all the modules imported by all the programs you've run within the IDE.

                        This example demonstrates how to use sys.modules.

                        Example 6.13. Using sys.modules

                        >>> import fileinfo         
                         >>> print '\n'.join(sys.modules.keys())
                        @@ -2604,7 +2604,7 @@ stat
                         
                      4. In this slightly less trivial case, join will add an extra backslash to the pathname before joining it to the filename. I was overjoyed when I discovered this, since addSlashIfNecessary is one of the stupid little functions I always need to write when building up my toolbox in a new language. Do not write this stupid little function in Python; smart people have already taken care of it for you.
                      5. expanduser will expand a pathname that uses ~ to represent the current user's home directory. This works on any platform where users have a home directory, like Windows, -UNIX, and Mac OS X; it has no effect on Mac OS. +UNIX, and Mac OS X; it has no effect on Mac OS.
                      6. Combining these techniques, you can easily construct pathnames for directories and files under the user's home directory.

                        Example 6.17. Splitting Pathnames

                        >>> os.path.split("c:\\music\\ap\\mahadeva.mp3")      
                         ('c:\\music\\ap', 'mahadeva.mp3')
                        @@ -2662,14 +2662,14 @@ def listDirectory(directory, fileExtList):
                                         if os.path.splitext(f)[1] in fileExtList]    
                        1. os.listdir(directory) returns a list of all the files and folders in directory. -
                        2. Iterating through the list with f, you use os.path.normcase(f) to normalize the case according to operating system defaults. normcase is a useful little function that compensates for case-insensitive operating systems that think that mahadeva.mp3 and mahadeva.MP3 are the same file. For instance, on Windows and Mac OS, normcase will convert the entire filename to lowercase; on UNIX-compatible systems, it will return the filename unchanged. +
                        3. Iterating through the list with f, you use os.path.normcase(f) to normalize the case according to operating system defaults. normcase is a useful little function that compensates for case-insensitive operating systems that think that mahadeva.mp3 and mahadeva.MP3 are the same file. For instance, on Windows and Mac OS, normcase will convert the entire filename to lowercase; on UNIX-compatible systems, it will return the filename unchanged.
                        4. Iterating through the normalized list with f again, you use os.path.splitext(f) to split each filename into name and extension.
                        5. For each file, you see if the extension is in the list of file extensions you care about (fileExtList, which was passed to the listDirectory function).
                        6. For each file you care about, you use os.path.join(directory, f) to construct the full pathname of the file, and return a list of the full pathnames.
                          NoteWhenever possible, you should use the functions in os and os.path for file, directory, and path manipulations. These modules are wrappers for platform-specific modules, so functions like -os.path.split work on UNIX, Windows, Mac OS, and any other platform supported by Python. +os.path.split work on UNIX, Windows, Mac OS, and any other platform supported by Python.

                          There is one other way to get the contents of a directory. It's very powerful, and it uses the sort of wildcards that you may already be familiar with from working on the command line.

                          Example 6.20. Listing Directories with glob

                          @@ -2735,7 +2735,7 @@ def listDirectory(directory, fileExtList):     
                           

                          Note that listDirectory is completely generic. It doesn't know ahead of time which types of files it will be getting, or which classes are defined that could potentially handle those files. It inspects the directory for the files to process, and then introspects its own module to see what special handler classes (like MP3FileInfo) are defined. You can extend this program to handle other types of files simply by defining an appropriately-named class: -HTMLFileInfo for HTML files, DOCFileInfo for Word .doc files, and so forth. listDirectory will handle them all, without modification, by handing off the real work to the appropriate classes and collating the results. +HTMLFileInfo for HTML files, DOCFileInfo for Word .doc files, and so forth. listDirectory will handle them all, without modification, by handing off the real work to the appropriate classes and collating the results.

                          6.7. Summary

                          The fileinfo.py program introduced in Chapter 5 should now make perfect sense.

                          
                          @@ -2829,10 +2829,10 @@ if __name__ == "__main__":
                           
                           
                           
                          -

                          Chapter 8. HTML Processing

                          +

                          Chapter 8. HTML Processing

                          8.1. Diving in

                          -

                          I often see questions on comp.lang.python like “How can I list all the [headers|images|links] in my HTML document?” “How do I parse/translate/munge the text of my HTML document but leave the tags alone?” “How can I add/remove/quote attributes of all my HTML tags at once?” This chapter will answer all of these questions. -

                          Here is a complete, working Python program in two parts. The first part, BaseHTMLProcessor.py, is a generic tool to help you process HTML files by walking through the tags and text blocks. The second part, dialect.py, is an example of how to use BaseHTMLProcessor.py to translate the text of an HTML document but leave the tags alone. Read the docstrings and comments to get an overview of what's going on. Most of it will seem like black magic, because it's not obvious how +

                          I often see questions on comp.lang.python like “How can I list all the [headers|images|links] in my HTML document?” “How do I parse/translate/munge the text of my HTML document but leave the tags alone?” “How can I add/remove/quote attributes of all my HTML tags at once?” This chapter will answer all of these questions. +

                          Here is a complete, working Python program in two parts. The first part, BaseHTMLProcessor.py, is a generic tool to help you process HTML files by walking through the tags and text blocks. The second part, dialect.py, is an example of how to use BaseHTMLProcessor.py to translate the text of an HTML document but leave the tags alone. Read the docstrings and comments to get an overview of what's going on. Most of it will seem like black magic, because it's not obvious how any of these class methods ever get called. Don't worry, all will be revealed in due time.

                          Example 8.1. BaseHTMLProcessor.py

                          If you have not already done so, you can download this and other examples used in this book. @@ -3061,7 +3061,7 @@ def test(url): if __name__ == "__main__": test("http://diveintopython3.org/odbchelper_list.html")

                          Example 8.3. Output of dialect.py

                          -

                          Running this script will translate Section 3.2, “Introducing Lists” into mock Swedish Chef-speak (from The Muppets), mock Elmer Fudd-speak (from Bugs Bunny cartoons), and mock Middle English (loosely based on Chaucer's The Canterbury Tales). If you look at the HTML source of the output pages, you'll see that all the HTML tags and attributes are untouched, but the text between the tags has been “translated” into the mock language. If you look closer, you'll see that, in fact, only the titles and paragraphs were translated; the +

                          Running this script will translate Section 3.2, “Introducing Lists” into mock Swedish Chef-speak (from The Muppets), mock Elmer Fudd-speak (from Bugs Bunny cartoons), and mock Middle English (loosely based on Chaucer's The Canterbury Tales). If you look at the HTML source of the output pages, you'll see that all the HTML tags and attributes are untouched, but the text between the tags has been “translated” into the mock language. If you look closer, you'll see that, in fact, only the titles and paragraphs were translated; the code listings and screen examples were left untouched.

                          
                           <div class=abstract>
                          @@ -3072,34 +3072,34 @@ in <span class=application>Powewbuiwdew</span>, bwace youwsewf fow
                           <span class=application>Pydon</span> wists.</p>
                           </div>
                           

                          8.2. Introducing sgmllib.py

                          -

                          HTML processing is broken into three steps: breaking down the HTML into its constituent pieces, fiddling with the pieces, and reconstructing the pieces into HTML again. The first step is done by sgmllib.py, a part of the standard Python library. -

                          The key to understanding this chapter is to realize that HTML is not just text, it is structured text. The structure is derived from the more-or-less-hierarchical sequence of start tags -and end tags. Usually you don't work with HTML this way; you work with it textually in a text editor, or visually in a web browser or web authoring tool. sgmllib.py presents HTML structurally. -

                          sgmllib.py contains one important class: SGMLParser. SGMLParser parses HTML into useful pieces, like start tags and end tags. As soon as it succeeds in breaking down some data into a useful piece, -it calls a method on itself based on what it found. In order to use the parser, you subclass the SGMLParser class and override these methods. This is what I meant when I said that it presents HTML structurally: the structure of the HTML determines the sequence of method calls and the arguments passed to each method. -

                          SGMLParser parses HTML into 8 kinds of data, and calls a separate method for each of them: +

                          HTML processing is broken into three steps: breaking down the HTML into its constituent pieces, fiddling with the pieces, and reconstructing the pieces into HTML again. The first step is done by sgmllib.py, a part of the standard Python library. +

                          The key to understanding this chapter is to realize that HTML is not just text, it is structured text. The structure is derived from the more-or-less-hierarchical sequence of start tags +and end tags. Usually you don't work with HTML this way; you work with it textually in a text editor, or visually in a web browser or web authoring tool. sgmllib.py presents HTML structurally. +

                          sgmllib.py contains one important class: SGMLParser. SGMLParser parses HTML into useful pieces, like start tags and end tags. As soon as it succeeds in breaking down some data into a useful piece, +it calls a method on itself based on what it found. In order to use the parser, you subclass the SGMLParser class and override these methods. This is what I meant when I said that it presents HTML structurally: the structure of the HTML determines the sequence of method calls and the arguments passed to each method. +

                          SGMLParser parses HTML into 8 kinds of data, and calls a separate method for each of them:

                          Start tag
                          -
                          An HTML tag that starts a block, like <html>, <head>, <body>, or <pre>, or a standalone tag like <br> or <img>. When it finds a start tag tagname, SGMLParser will look for a method called start_tagname or do_tagname. For instance, when it finds a <pre> tag, it will look for a start_pre or do_pre method. If found, SGMLParser calls this method with a list of the tag's attributes; otherwise, it calls unknown_starttag with the tag name and list of attributes. +
                          An HTML tag that starts a block, like <html>, <head>, <body>, or <pre>, or a standalone tag like <br> or <img>. When it finds a start tag tagname, SGMLParser will look for a method called start_tagname or do_tagname. For instance, when it finds a <pre> tag, it will look for a start_pre or do_pre method. If found, SGMLParser calls this method with a list of the tag's attributes; otherwise, it calls unknown_starttag with the tag name and list of attributes.
                          End tag
                          -
                          An HTML tag that ends a block, like </html>, </head>, </body>, or </pre>. When it finds an end tag, SGMLParser will look for a method called end_tagname. If found, SGMLParser calls this method, otherwise it calls unknown_endtag with the tag name. +
                          An HTML tag that ends a block, like </html>, </head>, </body>, or </pre>. When it finds an end tag, SGMLParser will look for a method called end_tagname. If found, SGMLParser calls this method, otherwise it calls unknown_endtag with the tag name.
                          Character reference
                          An escaped character referenced by its decimal or hexadecimal equivalent, like &#160;. When found, SGMLParser calls handle_charref with the text of the decimal or hexadecimal character equivalent.
                          Entity reference
                          -
                          An HTML entity, like &copy;. When found, SGMLParser calls handle_entityref with the name of the HTML entity. +
                          An HTML entity, like &copy;. When found, SGMLParser calls handle_entityref with the name of the HTML entity.
                          Comment
                          -
                          An HTML comment, enclosed in <!-- ... -->. When found, SGMLParser calls handle_comment with the body of the comment. +
                          An HTML comment, enclosed in <!-- ... -->. When found, SGMLParser calls handle_comment with the body of the comment.
                          Processing instruction
                          -
                          An HTML processing instruction, enclosed in <? ... >. When found, SGMLParser calls handle_pi with the body of the processing instruction. +
                          An HTML processing instruction, enclosed in <? ... >. When found, SGMLParser calls handle_pi with the body of the processing instruction.
                          Declaration
                          -
                          An HTML declaration, such as a DOCTYPE, enclosed in <! ... >. When found, SGMLParser calls handle_decl with the body of the declaration. +
                          An HTML declaration, such as a DOCTYPE, enclosed in <! ... >. When found, SGMLParser calls handle_decl with the body of the declaration.
                          Text data
                          A block of text. Anything that doesn't fit into the other 7 categories. When found, SGMLParser calls handle_data with the text. @@ -3108,13 +3108,13 @@ it calls a method on itself based on what it found. In order to use the parser,
                          ImportantPython 2.0 had a bug where SGMLParser would not recognize declarations at all (handle_decl would never be called), which meant that DOCTYPEs were silently ignored. This is fixed in Python 2.1. -

                          sgmllib.py comes with a test suite to illustrate this. You can run sgmllib.py, passing the name of an HTML file on the command line, and it will print out the tags and other elements as it parses them. It does this by subclassing +

                          sgmllib.py comes with a test suite to illustrate this. You can run sgmllib.py, passing the name of an HTML file on the command line, and it will print out the tags and other elements as it parses them. It does this by subclassing the SGMLParser class and defining unknown_starttag, unknown_endtag, handle_data and other methods which simply print their arguments. -
                          TipIn the ActivePython IDE on Windows, you can specify command line arguments in the “Run script” dialog. Separate multiple arguments with spaces. +TipIn the ActivePython IDE on Windows, you can specify command line arguments in the “Run script” dialog. Separate multiple arguments with spaces.

                          Example 8.4. Sample test of sgmllib.py

                          -

                          Here is a snippet from the table of contents of the HTML version of this book. Of course your paths may vary. (If you haven't downloaded the HTML version of the book, you can do so at http://diveintopython3.org/. +

                          Here is a snippet from the table of contents of the HTML version of this book. Of course your paths may vary. (If you haven't downloaded the HTML version of the book, you can do so at http://diveintopython3.org/.

                           c:\python23\lib> type "c:\downloads\diveintopython3\html\toc\index.html"
                           
                          @@ -3148,11 +3148,11 @@ data: '\n      '
                           

                          Here's the roadmap for the rest of the chapter:

                            -
                          • Subclass SGMLParser to create classes that extract interesting data out of HTML documents. +
                          • Subclass SGMLParser to create classes that extract interesting data out of HTML documents. -
                          • Subclass SGMLParser to create BaseHTMLProcessor, which overrides all 8 handler methods and uses them to reconstruct the original HTML from the pieces. +
                          • Subclass SGMLParser to create BaseHTMLProcessor, which overrides all 8 handler methods and uses them to reconstruct the original HTML from the pieces. -
                          • Subclass BaseHTMLProcessor to create Dialectizer, which adds some methods to process specific HTML tags specially, and overrides the handle_data method to provide a framework for processing the text blocks between the HTML tags. +
                          • Subclass BaseHTMLProcessor to create Dialectizer, which adds some methods to process specific HTML tags specially, and overrides the handle_data method to provide a framework for processing the text blocks between the HTML tags.
                          • Subclass Dialectizer to create classes that define text processing rules used by Dialectizer.handle_data. @@ -3160,9 +3160,9 @@ data: '\n '

                          Along the way, you'll also learn about locals, globals, and dictionary-based string formatting. -

                          8.3. Extracting data from HTML documents

                          -

                          To extract data from HTML documents, subclass the SGMLParser class and define methods for each tag or entity you want to capture. -

                          The first step to extracting data from an HTML document is getting some HTML. If you have some HTML lying around on your hard drive, you can use file functions to read it, but the real fun begins when you get HTML from live web pages. +

                          8.3. Extracting data from HTML documents

                          +

                          To extract data from HTML documents, subclass the SGMLParser class and define methods for each tag or entity you want to capture. +

                          The first step to extracting data from an HTML document is getting some HTML. If you have some HTML lying around on your hard drive, you can use file functions to read it, but the real fun begins when you get HTML from live web pages.

                          Example 8.5. Introducing urllib

                           >>> import urllib   
                           >>> sock = urllib.urlopen("http://diveintopython3.org/") 
                          @@ -3185,11 +3185,11 @@ data: '\n      '
                           
                           [...snip...]
                            -
                          1. The urllib module is part of the standard Python library. It contains functions for getting information about and actually retrieving data from Internet-based URLs (mainly web pages). -
                          2. The simplest use of urllib is to retrieve the entire text of a web page using the urlopen function. Opening a URL is similar to opening a file. The return value of urlopen is a file-like object, which has some of the same methods as a file object. -
                          3. The simplest thing to do with the file-like object returned by urlopen is read, which reads the entire HTML of the web page into a single string. The object also supports readlines, which reads the text line by line into a list. +
                          4. The urllib module is part of the standard Python library. It contains functions for getting information about and actually retrieving data from Internet-based URLs (mainly web pages). +
                          5. The simplest use of urllib is to retrieve the entire text of a web page using the urlopen function. Opening a URL is similar to opening a file. The return value of urlopen is a file-like object, which has some of the same methods as a file object. +
                          6. The simplest thing to do with the file-like object returned by urlopen is read, which reads the entire HTML of the web page into a single string. The object also supports readlines, which reads the text line by line into a list.
                          7. When you're done with the object, make sure to close it, just like a normal file object. -
                          8. You now have the complete HTML of the home page of http://diveintopython3.org/ in a string, and you're ready to parse it. +
                          9. You now have the complete HTML of the home page of http://diveintopython3.org/ in a string, and you're ready to parse it.

                            If you have not already done so, you can download this and other examples used in this book.

                            
                            @@ -3207,7 +3207,7 @@ class URLLister(SGMLParser):
                             
                            1. reset is called by the __init__ method of SGMLParser, and it can also be called manually once an instance of the parser has been created. So if you need to do any initialization, do it in reset, not in __init__, so that it will be re-initialized properly when someone re-uses a parser instance. -
                            2. start_a is called by SGMLParser whenever it finds an <a> tag. The tag may contain an href attribute, and/or other attributes, like name or title. The attrs parameter is a list of tuples, [(attribute, value), (attribute, value), ...]. Or it may be just an <a>, a valid (if useless) HTML tag, in which case attrs would be an empty list. +
                            3. start_a is called by SGMLParser whenever it finds an <a> tag. The tag may contain an href attribute, and/or other attributes, like name or title. The attrs parameter is a list of tuples, [(attribute, value), (attribute, value), ...]. Or it may be just an <a>, a valid (if useless) HTML tag, in which case attrs would be an empty list.
                            4. You can find out whether this <a> tag has an href attribute with a simple multi-variable list comprehension.
                            5. String comparisons like k=='href' are always case-sensitive, but that's safe in this case, because SGMLParser converts attribute names to lowercase while building attrs.

                              Example 8.7. Using urllister.py

                              @@ -3234,15 +3234,15 @@ download/diveintopython3-common-5.0.zip
                               
                               ... rest of output omitted for brevity ...
                                -
                              1. Call the feed method, defined in SGMLParser, to get HTML into the parser. +
                              2. Call the feed method, defined in SGMLParser, to get HTML into the parser. [1] It takes a string, which is what usock.read() returns. -
                              3. Like files, you should close your URL objects as soon as you're done with them. -
                              4. You should close your parser object, too, but for a different reason. You've read all the data and fed it to the parser, but the feed method isn't guaranteed to have actually processed all the HTML you give it; it may buffer it, waiting for more. Be sure to call close to flush the buffer and force everything to be fully parsed. -
                              5. Once the parser is closed, the parsing is complete, and parser.urls contains a list of all the linked URLs in the HTML document. (Your output may look different, if the download links have been updated by the time you read this.) +
                              6. Like files, you should close your URL objects as soon as you're done with them. +
                              7. You should close your parser object, too, but for a different reason. You've read all the data and fed it to the parser, but the feed method isn't guaranteed to have actually processed all the HTML you give it; it may buffer it, waiting for more. Be sure to call close to flush the buffer and force everything to be fully parsed. +
                              8. Once the parser is closed, the parsing is complete, and parser.urls contains a list of all the linked URLs in the HTML document. (Your output may look different, if the download links have been updated by the time you read this.)

                                8.4. Introducing BaseHTMLProcessor.py

                                SGMLParser doesn't produce anything by itself. It parses and parses and parses, and it calls a method for each interesting thing it - finds, but the methods don't do anything. SGMLParser is an HTML consumer: it takes HTML and breaks it down into small, structured pieces. As you saw in the previous section, you can subclass SGMLParser to define classes that catch specific tags and produce useful things, like a list of all the links on a web page. Now you'll - take this one step further by defining a class that catches everything SGMLParser throws at it and reconstructs the complete HTML document. In technical terms, this class will be an HTML producer. + finds, but the methods don't do anything. SGMLParser is an HTML consumer: it takes HTML and breaks it down into small, structured pieces. As you saw in the previous section, you can subclass SGMLParser to define classes that catch specific tags and produce useful things, like a list of all the links on a web page. Now you'll + take this one step further by defining a class that catches everything SGMLParser throws at it and reconstructs the complete HTML document. In technical terms, this class will be an HTML producer.

                                BaseHTMLProcessor subclasses SGMLParser and provides all 8 essential handler methods: unknown_starttag, unknown_endtag, handle_charref, handle_entityref, handle_comment, handle_pi, handle_decl, and handle_data.

                                Example 8.8. Introducing BaseHTMLProcessor

                                
                                 class BaseHTMLProcessor(SGMLParser):
                                @@ -3277,27 +3277,27 @@ class BaseHTMLProcessor(SGMLParser):
                                     def handle_decl(self, text):
                                         self.pieces.append("<!%(text)s>" % locals())
                                  -
                                1. reset, called by SGMLParser.__init__, initializes self.pieces as an empty list before calling the ancestor method. self.pieces is a data attribute which will hold the pieces of the HTML document you're constructing. Each handler method will reconstruct the HTML that SGMLParser parsed, and each method will append that string to self.pieces. Note that self.pieces is a list. You might be tempted to define it as a string and just keep appending each piece to it. That would work, but +
                                2. reset, called by SGMLParser.__init__, initializes self.pieces as an empty list before calling the ancestor method. self.pieces is a data attribute which will hold the pieces of the HTML document you're constructing. Each handler method will reconstruct the HTML that SGMLParser parsed, and each method will append that string to self.pieces. Note that self.pieces is a list. You might be tempted to define it as a string and just keep appending each piece to it. That would work, but Python is much more efficient at dealing with lists. -[2]
                                3. Since BaseHTMLProcessor does not define any methods for specific tags (like the start_a method in URLLister), SGMLParser will call unknown_starttag for every start tag. This method takes the tag (tag) and the list of attribute name/value pairs (attrs), reconstructs the original HTML, and appends it to self.pieces. The string formatting here is a little strange; you'll untangle that (and also the odd-looking locals function) later in this chapter. +[2]
                                4. Since BaseHTMLProcessor does not define any methods for specific tags (like the start_a method in URLLister), SGMLParser will call unknown_starttag for every start tag. This method takes the tag (tag) and the list of attribute name/value pairs (attrs), reconstructs the original HTML, and appends it to self.pieces. The string formatting here is a little strange; you'll untangle that (and also the odd-looking locals function) later in this chapter.
                                5. Reconstructing end tags is much simpler; just take the tag name and wrap it in the </...> brackets. -
                                6. When SGMLParser finds a character reference, it calls handle_charref with the bare reference. If the HTML document contains the reference &#160;, ref will be 160. Reconstructing the original complete character reference just involves wrapping ref in &#...; characters. +
                                7. When SGMLParser finds a character reference, it calls handle_charref with the bare reference. If the HTML document contains the reference &#160;, ref will be 160. Reconstructing the original complete character reference just involves wrapping ref in &#...; characters.
                                8. Entity references are similar to character references, but without the hash mark. Reconstructing the original entity reference requires wrapping ref in &...; characters. (Actually, as an erudite reader pointed out to me, it's slightly more complicated than this. Only certain standard -HTML entities end in a semicolon; other similar-looking entities do not. Luckily for us, the set of standard HTML entities is defined in a dictionary in a Python module called htmlentitydefs. Hence the extra if statement.) +HTML entities end in a semicolon; other similar-looking entities do not. Luckily for us, the set of standard HTML entities is defined in a dictionary in a Python module called htmlentitydefs. Hence the extra if statement.)
                                9. Blocks of text are simply appended to self.pieces unaltered. -
                                10. HTML comments are wrapped in <!--...--> characters. +
                                11. HTML comments are wrapped in <!--...--> characters.
                                12. Processing instructions are wrapped in <?...> characters. -
                                  Important: The HTML specification requires that all non-HTML (like client-side JavaScript) must be enclosed in HTML comments, but not all web pages do this properly (and all modern web browsers are forgiving if they don't). BaseHTMLProcessor is not forgiving; if script is improperly embedded, it will be parsed as if it were HTML. For instance, if the script contains less-than and equals signs, SGMLParser may incorrectly think that it has found tags and attributes. SGMLParser always converts tags and attribute names to lowercase, which may break the script, and BaseHTMLProcessor always encloses attribute values in double quotes (even if the original HTML document used single quotes or no quotes), which will certainly break the script. Always protect your client-side script - within HTML comments. +Important: The HTML specification requires that all non-HTML (like client-side JavaScript) must be enclosed in HTML comments, but not all web pages do this properly (and all modern web browsers are forgiving if they don't). BaseHTMLProcessor is not forgiving; if script is improperly embedded, it will be parsed as if it were HTML. For instance, if the script contains less-than and equals signs, SGMLParser may incorrectly think that it has found tags and attributes. SGMLParser always converts tags and attribute names to lowercase, which may break the script, and BaseHTMLProcessor always encloses attribute values in double quotes (even if the original HTML document used single quotes or no quotes), which will certainly break the script. Always protect your client-side script + within HTML comments.

                                  Example 8.9. BaseHTMLProcessor output

                                  
                                       def output(self):               
                                           """Return processed HTML as a single string"""
                                           return "".join(self.pieces) 
                                    -
                                  1. This is the one method in BaseHTMLProcessor that is never called by the ancestor SGMLParser. Since the other handler methods store their reconstructed HTML in self.pieces, this function is needed to join all those pieces into one string. As noted before, Python is great at lists and mediocre at strings, so you only create the complete string when somebody explicitly asks for it. +
                                  2. This is the one method in BaseHTMLProcessor that is never called by the ancestor SGMLParser. Since the other handler methods store their reconstructed HTML in self.pieces, this function is needed to join all those pieces into one string. As noted before, Python is great at lists and mediocre at strings, so you only create the complete string when somebody explicitly asks for it.
                                  3. If you prefer, you could use the join method of the string module instead: string.join(self.pieces, "")

                                    Further reading

                                      @@ -3307,7 +3307,7 @@ Python is much more efficient at dealing with lists.

                                    8.5. locals and globals

                                    -

                                    Let's digress from HTML processing for a minute and talk about how Python handles variables. Python has two built-in functions, locals and globals, which provide dictionary-based access to local and global variables. +

                                    Let's digress from HTML processing for a minute and talk about how Python handles variables. Python has two built-in functions, locals and globals, which provide dictionary-based access to local and global variables.

                                    Remember locals? You first saw it here:

                                    
                                         def unknown_starttag(self, tag, attrs):
                                    @@ -3458,8 +3458,8 @@ meaningful keys and values already. Like Important
                                  Using dictionary-based string formatting with locals is a convenient way of making complex string formatting expressions more readable, but it comes with a price. There is a slight performance hit in making the call to locals, since locals builds a copy of the local namespace.

                                  8.7. Quoting attribute values

                                  -

                                  A common question on comp.lang.python is “I have a bunch of HTML documents with unquoted attribute values, and I want to properly quote them all. How can I do this?”[4] (This is generally precipitated by a project manager who has found the HTML-is-a-standard religion joining a large project and proclaiming that all pages must validate against an HTML validator. Unquoted attribute values are a common violation of the HTML standard.) Whatever the reason, unquoted attribute values are easy to fix by feeding HTML through BaseHTMLProcessor. -

                                  BaseHTMLProcessor consumes HTML (since it's descended from SGMLParser) and produces equivalent HTML, but the HTML output is not identical to the input. Tags and attribute names will end up in lowercase, even if they started in uppercase +

                                  A common question on comp.lang.python is “I have a bunch of HTML documents with unquoted attribute values, and I want to properly quote them all. How can I do this?”[4] (This is generally precipitated by a project manager who has found the HTML-is-a-standard religion joining a large project and proclaiming that all pages must validate against an HTML validator. Unquoted attribute values are a common violation of the HTML standard.) Whatever the reason, unquoted attribute values are easy to fix by feeding HTML through BaseHTMLProcessor. +

                                  BaseHTMLProcessor consumes HTML (since it's descended from SGMLParser) and produces equivalent HTML, but the HTML output is not identical to the input. Tags and attribute names will end up in lowercase, even if they started in uppercase or mixed case, and attribute values will be enclosed in double quotes, even if they started in single quotes or with no quotes at all. It is this last side effect that you can take advantage of.

                                  Example 8.16. Quoting attribute values

                                  @@ -3492,10 +3492,10 @@ at all. It is this last side effect that you can take advantage of.
                                   </body>
                                   </html>
                                    -
                                  1. Note that the attribute values of the href attributes in the <a> tags are not properly quoted. (Also note that you're using triple quotes for something other than a docstring. And directly in the IDE, no less. They're very useful.) +
                                  2. Note that the attribute values of the href attributes in the <a> tags are not properly quoted. (Also note that you're using triple quotes for something other than a docstring. And directly in the IDE, no less. They're very useful.)
                                  3. Feed the parser.
                                  4. Using the output function defined in BaseHTMLProcessor, you get the output as a single string, complete with quoted attribute values. While this may seem anti-climactic, think - about how much has actually happened here: SGMLParser parsed the entire HTML document, breaking it down into tags, refs, data, and so forth; BaseHTMLProcessor used those elements to reconstruct pieces of HTML (which are still stored in parser.pieces, if you want to see them); finally, you called parser.output, which joined all the pieces of HTML into one string. + about how much has actually happened here: SGMLParser parsed the entire HTML document, breaking it down into tags, refs, data, and so forth; BaseHTMLProcessor used those elements to reconstruct pieces of HTML (which are still stored in parser.pieces, if you want to see them); finally, you called parser.output, which joined all the pieces of HTML into one string.

                                    8.8. Introducing dialect.py

                                    Dialectizer is a simple (and silly) descendant of BaseHTMLProcessor. It runs blocks of text through a series of substitutions, but it makes sure that anything within a <pre>...</pre> block passes through unaltered.

                                    To handle the <pre> blocks, you define two methods in Dialectizer: start_pre and end_pre. @@ -3508,7 +3508,7 @@ at all. It is this last side effect that you can take advantage of. self.unknown_endtag("pre") self.verbatim -= 1

                                      -
                                    1. start_pre is called every time SGMLParser finds a <pre> tag in the HTML source. (In a minute, you'll see exactly how this happens.) The method takes a single parameter, attrs, which contains the attributes of the tag (if any). attrs is a list of key/value tuples, just like unknown_starttag takes. +
                                    2. start_pre is called every time SGMLParser finds a <pre> tag in the HTML source. (In a minute, you'll see exactly how this happens.) The method takes a single parameter, attrs, which contains the attributes of the tag (if any). attrs is a list of key/value tuples, just like unknown_starttag takes.
                                    3. In the reset method, you initialize a data attribute that serves as a counter for <pre> tags. Every time you hit a <pre> tag, you increment the counter; every time you hit a </pre> tag, you'll decrement the counter. (You could just use this as a flag and set it to 1 and reset it to 0, but it's just as easy to do it this way, and this handles the odd (but possible) case of nested <pre> tags.) In a minute, you'll see how this counter is put to good use.
                                    4. That's it, that's the only special processing you do for <pre> tags. Now you pass the list of attributes along to unknown_starttag so it can do the default processing.
                                    5. end_pre is called every time SGMLParser finds a </pre> tag. Since end tags can not contain attributes, the method takes no parameters. @@ -3563,7 +3563,7 @@ you need to override the handle_data method.
                                    6. In the ancestor BaseHTMLProcessor, the handle_data method simply appended the text to the output buffer, self.pieces. Here the logic is only slightly more complicated. If you're in the middle of a <pre>...</pre> block, self.verbatim will be some value greater than 0, and you want to put the text in the output buffer unaltered. Otherwise, you will call a separate method to process the substitutions, then put the result of that into the output buffer. In Python, this is a one-liner, using the and-or trick.

                                      You're close to completely understanding Dialectizer. The only missing link is the nature of the text substitutions themselves. If you know any Perl, you know that when complex text substitutions are required, the only real solution is regular expressions. The classes -later in dialect.py define a series of regular expressions that operate on the text between the HTML tags. But you just had a whole chapter on regular expressions. You don't really want to slog through regular expressions again, do you? God knows I don't. I think you've learned enough +later in dialect.py define a series of regular expressions that operate on the text between the HTML tags. But you just had a whole chapter on regular expressions. You don't really want to slog through regular expressions again, do you? God knows I don't. I think you've learned enough for one chapter.

                                      8.9. Putting it all together

                                      It's time to put everything you've learned so far to good use. I hope you were paying attention. @@ -3596,7 +3596,7 @@ def translate(url, dialectName="chef"):

                                      Why bother? After all, there are only 3 Dialectizer classes; why not just use a case statement? (Well, there's no case statement in Python, but why not just use a series of if statements?) One reason: extensibility. The translate function has absolutely no idea how many Dialectizer classes you've defined. Imagine if you defined a new FooDialectizer tomorrow; translate would work by passing 'foo' as the dialectName.

                                      Even better, imagine putting FooDialectizer in a separate module, and importing it with from module import. You've already seen that this includes it in globals(), so translate would still work without modification, even though FooDialectizer was in a separate file.

                                      Now imagine that the name of the dialect is coming from somewhere outside the program, maybe from a database or from a user-inputted -value on a form. You can use any number of server-side Python scripting architectures to dynamically generate web pages; this function could take a URL and a dialect name (both strings) in the query string of a web page request, and output the “translated” web page. +value on a form. You can use any number of server-side Python scripting architectures to dynamically generate web pages; this function could take a URL and a dialect name (both strings) in the query string of a web page request, and output the “translated” web page.

                                      Finally, imagine a Dialectizer framework with a plug-in architecture. You could put each Dialectizer class in a separate file, leaving only the translate function in dialect.py. Assuming a consistent naming scheme, the translate function could dynamically import the appropriate class from the appropriate file, given nothing but the dialect name. (You haven't seen dynamic importing yet, but I promise to cover it in a later chapter.) To add a new dialect, you would simply add an appropriately-named file in the plug-ins directory (like foodialect.py which contains the FooDialectizer class). Calling the translate function with the dialect name 'foo' would find the module foodialect.py, import the class FooDialectizer, and away you go. @@ -3606,12 +3606,12 @@ appropriately-named file in the plug-ins directory (like foodialect.py③

                                        -
                                      1. After all that imagining, this is going to seem pretty boring, but the feed function is what does the entire transformation. You had the entire HTML source in a single string, so you only had to call feed once. However, you can call feed as often as you want, and the parser will just keep parsing. So if you were worried about memory usage (or you knew you - were going to be dealing with very large HTML pages), you could set this up in a loop, where you read a few bytes of HTML and fed it to the parser. The result would be the same. +
                                      2. After all that imagining, this is going to seem pretty boring, but the feed function is what does the entire transformation. You had the entire HTML source in a single string, so you only had to call feed once. However, you can call feed as often as you want, and the parser will just keep parsing. So if you were worried about memory usage (or you knew you + were going to be dealing with very large HTML pages), you could set this up in a loop, where you read a few bytes of HTML and fed it to the parser. The result would be the same.
                                      3. Because feed maintains an internal buffer, you should always call the parser's close method when you're done (even if you fed it all at once, like you did). Otherwise you may find that your output is missing the last few bytes.
                                      4. Remember, output is the function you defined on BaseHTMLProcessor that joins all the pieces of output you've buffered and returns them in a single string. -

                                        And just like that, you've “translated” a web page, given nothing but a URL and the name of a dialect. +

                                        And just like that, you've “translated” a web page, given nothing but a URL and the name of a dialect.

                                        Further reading

                                          @@ -3619,14 +3619,14 @@ appropriately-named file in the plug-ins directory (like foodialect.py

                                          8.10. Summary

                                          -

                                          Python provides you with a powerful tool, sgmllib.py, to manipulate HTML by turning its structure into an object model. You can use this tool in many different ways. +

                                          Python provides you with a powerful tool, sgmllib.py, to manipulate HTML by turning its structure into an object model. You can use this tool in many different ways.

                                            -
                                          • parsing the HTML looking for something specific +
                                          • parsing the HTML looking for something specific -
                                          • aggregating the results, like the URL lister +
                                          • aggregating the results, like the URL lister
                                          • altering the structure along the way, like the attribute quoter -
                                          • transforming the HTML into something else by manipulating the text while leaving the tags alone, like the Dialectizer +
                                          • transforming the HTML into something else by manipulating the text while leaving the tags alone, like the Dialectizer

                                          Along with these examples, you should be comfortable doing all of the following things:

                                          @@ -3638,7 +3638,7 @@ appropriately-named file in the plug-ins directory (like foodialect.py


                                          -

                                          [1] The technical term for a parser like SGMLParser is a consumer: it consumes HTML and breaks it down. Presumably, the name feed was chosen to fit into the whole “consumer” motif. Personally, it makes me think of an exhibit in the zoo where there's just a dark cage with no trees or plants or +

                                          [1] The technical term for a parser like SGMLParser is a consumer: it consumes HTML and breaks it down. Presumably, the name feed was chosen to fit into the whole “consumer” motif. Personally, it makes me think of an exhibit in the zoo where there's just a dark cage with no trees or plants or evidence of life of any kind, but if you stand perfectly still and look really closely you can make out two beady eyes staring back at you from the far left corner, but you convince yourself that that's just your mind playing tricks on you, and the only way you can tell that the whole thing isn't just an empty cage is a small innocuous sign on the railing that reads, “Do not feed the parser.” But maybe that's just me. In any event, it's an interesting mental image. @@ -3650,18 +3650,18 @@ appropriately-named file in the plug-ins directory (like foodialect.py

                                          [3] I don't get out much.

                                          -

                                          [4] All right, it's not that common a question. It's not up there with “What editor should I use to write Python code?” (answer: Emacs) or “Is Python better or worse than Perl?” (answer: “Perl is worse than Python because people wanted it worse.” -Larry Wall, 10/14/1998) But questions about HTML processing pop up in one form or another about once a month, and among those questions, this is a popular one. +

                                          [4] All right, it's not that common a question. It's not up there with “What editor should I use to write Python code?” (answer: Emacs) or “Is Python better or worse than Perl?” (answer: “Perl is worse than Python because people wanted it worse.” -Larry Wall, 10/14/1998) But questions about HTML processing pop up in one form or another about once a month, and among those questions, this is a popular one.

                                          -

                                          Chapter 9. XML Processing

                                          +

                                          Chapter 9. XML Processing

                                          9.1. Diving in

                                          -

                                          These next two chapters are about XML processing in Python. It would be helpful if you already knew what an XML document looks like, that it's made up of structured tags to form a hierarchy of elements, and so on. If this doesn't make -sense to you, there are many XML tutorials that can explain the basics. +

                                          These next two chapters are about XML processing in Python. It would be helpful if you already knew what an XML document looks like, that it's made up of structured tags to form a hierarchy of elements, and so on. If this doesn't make +sense to you, there are many XML tutorials that can explain the basics.

                                          If you're not particularly interested in XML, you should still read these chapters, which cover important topics like Python packages, Unicode, command line arguments, and how to use getattr for method dispatching.

                                          Being a philosophy major is not required, although if you have ever had the misfortune of being subjected to the writings of Immanuel Kant, you will appreciate the example program a lot more than if you majored in something useful, like computer science. -

                                          There are two basic ways to work with XML. One is called SAX (“Simple API for XML”), and it works by reading the XML a little bit at a time and calling a method for each element it finds. (If you read Chapter 8, HTML Processing, this should sound familiar, because that's how the sgmllib module works.) The other is called DOM (“Document Object Model”), and it works by reading in the entire XML document at once and creating an internal representation of it using native Python classes linked in a tree structure. Python has standard modules for both kinds of parsing, but this chapter will only deal with using the DOM. -

                                          The following is a complete Python program which generates pseudo-random output based on a context-free grammar defined in an XML format. Don't worry yet if you don't understand what that means; you'll examine both the program's input and its output +

                                          There are two basic ways to work with XML. One is called SAX (“Simple API for XML”), and it works by reading the XML a little bit at a time and calling a method for each element it finds. (If you read Chapter 8, HTML Processing, this should sound familiar, because that's how the sgmllib module works.) The other is called DOM (“Document Object Model”), and it works by reading in the entire XML document at once and creating an internal representation of it using native Python classes linked in a tree structure. Python has standard modules for both kinds of parsing, but this chapter will only deal with using the DOM. +

                                          The following is a complete Python program which generates pseudo-random output based on a context-free grammar defined in an XML format. Don't worry yet if you don't understand what that means; you'll examine both the program's input and its output in more depth throughout these next two chapters.

                                          Example 9.1. kgp.py

                                          If you have not already done so, you can download this and other examples used in this book. @@ -3953,7 +3953,7 @@ def openAnything(source): # treat source as string import StringIO return StringIO.StringIO(str(source)) -

                                          Run the program kgp.py by itself, and it will parse the default XML-based grammar, in kant.xml, and print several paragraphs worth of philosophy in the style of Immanuel Kant. +

                                          Run the program kgp.py by itself, and it will parse the default XML-based grammar, in kant.xml, and print several paragraphs worth of philosophy in the style of Immanuel Kant.

                                          Example 9.3. Sample output of kgp.py

                                          [you@localhost kgp]$ python kgp.py
                                                As is shown in the writings of Hume, our a priori concepts, in
                                           reference to ends, abstract from all content of knowledge; in the study
                                          @@ -4001,9 +4001,9 @@ completely different.
                                           10110100

                                          You will take a closer look at the structure of the grammar file later in this chapter. For now, all you need to know is that the grammar file defines the structure of the output, and the kgp.py program reads through the grammar and makes random decisions about which words to plug in where.

                                          9.2. Packages

                                          -

                                          Actually parsing an XML document is very simple: one line of code. However, before you get to that line of code, you need to take a short detour +

                                          Actually parsing an XML document is very simple: one line of code. However, before you get to that line of code, you need to take a short detour to talk about packages. -

                                          Example 9.5. Loading an XML document (a sneak peek)

                                          +

                                          Example 9.5. Loading an XML document (a sneak peek)

                                           >>> from xml.dom import minidom 
                                           >>> xmldoc = minidom.parse('~/diveintopython3/common/py/kgp/binary.xml')
                                            @@ -4052,13 +4052,13 @@ The answer is the magical __init__.py file. You see, packages are n
                                  Note: A package is a directory with the special __init__.py file in it. The __init__.py file defines the attributes and methods of the package. It doesn't need to define anything; it can just be an empty file, but it has to exist. But if __init__.py doesn't exist, the directory is just a directory, not a package, and it can't be imported or contain modules or nested packages. -

                                  So why bother with packages? Well, they provide a way to logically group related modules. Instead of having an xml package with sax and dom packages inside, the authors could have chosen to put all the sax functionality in xmlsax.py and all the dom functionality in xmldom.py, or even put all of it in a single module. But that would have been unwieldy (as of this writing, the XML package has over 3000 lines of code) and difficult to manage (separate source files mean multiple people can work on different +

                                  So why bother with packages? Well, they provide a way to logically group related modules. Instead of having an xml package with sax and dom packages inside, the authors could have chosen to put all the sax functionality in xmlsax.py and all the dom functionality in xmldom.py, or even put all of it in a single module. But that would have been unwieldy (as of this writing, the XML package has over 3000 lines of code) and difficult to manage (separate source files mean multiple people can work on different areas simultaneously).

                                  If you ever find yourself writing a large subsystem in Python (or, more likely, when you realize that your small subsystem has grown into a large one), invest some time designing a good package architecture. It's one of the many things Python is good at, so take advantage of it. -

                                  9.3. Parsing XML

                                  -

                                  As I was saying, actually parsing an XML document is very simple: one line of code. Where you go from there is up to you. -

                                  Example 9.8. Loading an XML document (for real this time)

                                  +

                                  9.3. Parsing XML

                                  +

                                  As I was saying, actually parsing an XML document is very simple: one line of code. Where you go from there is up to you. +

                                  Example 9.8. Loading an XML document (for real this time)

                                   >>> from xml.dom import minidom      
                                   >>> xmldoc = minidom.parse('~/diveintopython3/common/py/kgp/binary.xml')  
                                   >>> xmldoc         
                                  @@ -4077,11 +4077,11 @@ package architecture. It's one of the many things Python is good at, so take adv
                                   </grammar>
                                  1. As you saw in the previous section, this imports the minidom module from the xml.dom package. -
                                  2. Here is the one line of code that does all the work: minidom.parse takes one argument and returns a parsed representation of the XML document. The argument can be many things; in this case, it's simply a filename of an XML document on my local disk. (To follow along, you'll need to change the path to point to your downloaded examples directory.) +
                                  3. Here is the one line of code that does all the work: minidom.parse takes one argument and returns a parsed representation of the XML document. The argument can be many things; in this case, it's simply a filename of an XML document on my local disk. (To follow along, you'll need to change the path to point to your downloaded examples directory.) But you can also pass a file object, or even a file-like object. You'll take advantage of this flexibility later in this chapter. -
                                  4. The object returned from minidom.parse is a Document object, a descendant of the Node class. This Document object is the root level of a complex tree-like structure of interlocking Python objects that completely represent the XML document you passed to minidom.parse. -
                                  5. toxml is a method of the Node class (and is therefore available on the Document object you got from minidom.parse). toxml prints out the XML that this Node represents. For the Document node, this prints out the entire XML document. -

                                    Now that you have an XML document in memory, you can start traversing through it. +

                                  6. The object returned from minidom.parse is a Document object, a descendant of the Node class. This Document object is the root level of a complex tree-like structure of interlocking Python objects that completely represent the XML document you passed to minidom.parse. +
                                  7. toxml is a method of the Node class (and is therefore available on the Document object you got from minidom.parse). toxml prints out the XML that this Node represents. For the Document node, this prints out the entire XML document. +

                                    Now that you have an XML document in memory, you can start traversing through it.

                                    Example 9.9. Getting child nodes

                                     >>> xmldoc.childNodes    
                                     [<DOM Element: grammar at 17538908>]
                                    @@ -4090,7 +4090,7 @@ package architecture. It's one of the many things Python is good at, so take adv
                                     >>> xmldoc.firstChild    
                                     <DOM Element: grammar at 17538908>
                                      -
                                    1. Every Node has a childNodes attribute, which is a list of the Node objects. A Document always has only one child node, the root element of the XML document (in this case, the grammar element). +
                                    2. Every Node has a childNodes attribute, which is a list of the Node objects. A Document always has only one child node, the root element of the XML document (in this case, the grammar element).
                                    3. To get the first (and in this case, the only) child node, just use regular list syntax. Remember, there is nothing special going on here; this is just a regular Python list of regular Python objects.
                                    4. Since getting the first child node of a node is a useful and common activity, the Node class has a firstChild attribute, which is synonymous with childNodes[0]. (There is also a lastChild attribute, which is synonymous with childNodes[-1].) @@ -4108,7 +4108,7 @@ package architecture. It's one of the many things Python is good at, so take adv </ref> </grammar>
                                    -
                                  1. Since the toxml method is defined in the Node class, it is available on any XML node, not just the Document element. +
                                  2. Since the toxml method is defined in the Node class, it is available on any XML node, not just the Document element.

                                    Example 9.11. Child nodes can be text

                                     >>> grammarNode.childNodes
                                     [<DOM Text node "\n">, <DOM Element: ref at 17533332>, \
                                    @@ -4132,7 +4132,7 @@ package architecture. It's one of the many things Python is good at, so take adv
                                     
                                     
                                      -
                                    1. Looking at the XML in binary.xml, you might think that the grammar has only two child nodes, the two ref elements. But you're missing something: the carriage returns! After the '<grammar>' and before the first '<ref>' is a carriage return, and this text counts as a child node of the grammar element. Similarly, there is a carriage return after each '</ref>'; these also count as child nodes. So grammar.childNodes is actually a list of 5 objects: 3 Text objects and 2 Element objects. +
                                    2. Looking at the XML in binary.xml, you might think that the grammar has only two child nodes, the two ref elements. But you're missing something: the carriage returns! After the '<grammar>' and before the first '<ref>' is a carriage return, and this text counts as a child node of the grammar element. Similarly, there is a carriage return after each '</ref>'; these also count as child nodes. So grammar.childNodes is actually a list of 5 objects: 3 Text objects and 2 Element objects.
                                    3. The first child is a Text object representing the carriage return after the '<grammar>' tag and before the first '<ref>' tag.
                                    4. The second child is an Element object representing the first ref element.
                                    5. The fourth child is an Element object representing the second ref element. @@ -4163,7 +4163,7 @@ u'0'
                                    6. The p element has only one child node (you can't tell that from this example, but look at pNode.childNodes if you don't believe me), and it is a Text node for the single character '0'.
                                    7. The .data attribute of a Text node gives you the actual string that the text node represents. But what is that 'u' in front of the string? The answer to that deserves its own section.

                                      9.4. Unicode

                                      -

                                      Unicode is a system to represent characters from all the world's different languages. When Python parses an XML document, all data is stored in memory as unicode. +

                                      Unicode is a system to represent characters from all the world's different languages. When Python parses an XML document, all data is stored in memory as unicode.

                                      You'll get to all that in a minute, but first, some background.

                                      Historical note. Before unicode, there were separate character encoding systems for each language, each using the same numbers (0-255) to represent that language's characters. Some languages (like Russian) have multiple conflicting standards about how to represent the @@ -4179,14 +4179,14 @@ mode, so character 241 means something else. And so on.) These are the problems [5] Each 2-byte number represents a unique character used in at least one of the world's languages. (Characters that are used in multiple languages have the same numeric code.) There is exactly 1 number per character, and exactly 1 character per number. Unicode data is never ambiguous. -

                                      Of course, there is still the matter of all these legacy encoding systems. 7-bit ASCII, for instance, which stores English characters as numbers ranging from 0 to 127. (65 is capital “A”, 97 is lowercase “a”, and so forth.) English has a very simple alphabet, so it can be completely expressed in 7-bit ASCII. Western European languages like French, Spanish, and German all use an encoding system called ISO-8859-1 (also called “latin-1”), which uses the 7-bit ASCII characters for the numbers 0 through 127, but then extends into the 128-255 range for characters like n-with-a-tilde-over-it -(241), and u-with-two-dots-over-it (252). And unicode uses the same characters as 7-bit ASCII for 0 through 127, and the same characters as ISO-8859-1 for 128 through 255, and then extends from there into characters +

                                      Of course, there is still the matter of all these legacy encoding systems. 7-bit ASCII, for instance, which stores English characters as numbers ranging from 0 to 127. (65 is capital “A”, 97 is lowercase “a”, and so forth.) English has a very simple alphabet, so it can be completely expressed in 7-bit ASCII. Western European languages like French, Spanish, and German all use an encoding system called ISO-8859-1 (also called “latin-1”), which uses the 7-bit ASCII characters for the numbers 0 through 127, but then extends into the 128-255 range for characters like n-with-a-tilde-over-it +(241), and u-with-two-dots-over-it (252). And unicode uses the same characters as 7-bit ASCII for 0 through 127, and the same characters as ISO-8859-1 for 128 through 255, and then extends from there into characters for other languages with the remaining numbers, 256 through 65535.

                                      When dealing with unicode data, you may at some point need to convert the data back into one of these other legacy encoding systems. For instance, to integrate with some other computer system which expects its data in a specific 1-byte encoding -scheme, or to print it to a non-unicode-aware terminal or printer. Or to store it in an XML document which explicitly specifies the encoding scheme. +scheme, or to print it to a non-unicode-aware terminal or printer. Or to store it in an XML document which explicitly specifies the encoding scheme.

                                      And on that note, let's get back to Python. -

                                      Python has had unicode support throughout the language since version 2.0. The XML package uses unicode to store all parsed XML data, but you can use unicode anywhere. +

                                      Python has had unicode support throughout the language since version 2.0. The XML package uses unicode to store all parsed XML data, but you can use unicode anywhere.

                                      Example 9.13. Introducing unicode

                                       >>> s = u'Dive in'            
                                       >>> s
                                      @@ -4194,9 +4194,9 @@ u'Dive in'
                                       >>> print s 
                                       Dive in
                                        -
                                      1. To create a unicode string instead of a regular ASCII string, add the letter “u” before the string. Note that this particular string doesn't have any non-ASCII characters. That's fine; unicode is a superset of ASCII (a very large superset at that), so any regular ASCII string can also be stored as unicode. -
                                      2. When printing a string, Python will attempt to convert it to your default encoding, which is usually ASCII. (More on this in a minute.) Since this unicode string is made up of characters that are also ASCII characters, printing it has the same result as printing a normal ASCII string; the conversion is seamless, and if you didn't know that s was a unicode string, you'd never notice the difference. -

                                        Example 9.14. Storing non-ASCII characters

                                        +
                                      3. To create a unicode string instead of a regular ASCII string, add the letter “u” before the string. Note that this particular string doesn't have any non-ASCII characters. That's fine; unicode is a superset of ASCII (a very large superset at that), so any regular ASCII string can also be stored as unicode. +
                                      4. When printing a string, Python will attempt to convert it to your default encoding, which is usually ASCII. (More on this in a minute.) Since this unicode string is made up of characters that are also ASCII characters, printing it has the same result as printing a normal ASCII string; the conversion is seamless, and if you didn't know that s was a unicode string, you'd never notice the difference. +

                                        Example 9.14. Storing non-ASCII characters

                                         >>> s = u'La Pe\xf1a'         
                                         >>> print s 
                                         Traceback (innermost last):
                                        @@ -4205,11 +4205,11 @@ UnicodeError: ASCII encoding error: ordinal not in range(128)
                                         >>> print s.encode('latin-1') 
                                         La Peña
                                          -
                                        1. The real advantage of unicode, of course, is its ability to store non-ASCII characters, like the Spanish “ñ” (n with a tilde over it). The unicode character code for the tilde-n is 0xf1 in hexadecimal (241 in decimal), which you can type like this: \xf1. -
                                        2. Remember I said that the print function attempts to convert a unicode string to ASCII so it can print it? Well, that's not going to work here, because your unicode string contains non-ASCII characters, so Python raises a UnicodeError error. +
                                        3. The real advantage of unicode, of course, is its ability to store non-ASCII characters, like the Spanish “ñ” (n with a tilde over it). The unicode character code for the tilde-n is 0xf1 in hexadecimal (241 in decimal), which you can type like this: \xf1. +
                                        4. Remember I said that the print function attempts to convert a unicode string to ASCII so it can print it? Well, that's not going to work here, because your unicode string contains non-ASCII characters, so Python raises a UnicodeError error.
                                        5. Here's where the conversion-from-unicode-to-other-encoding-schemes comes in. s is a unicode string, but print can only print a regular string. To solve this problem, you call the encode method, available on every unicode string, to convert the unicode string to a regular string in the given encoding scheme, - which you pass as a parameter. In this case, you're using latin-1 (also known as iso-8859-1), which includes the tilde-n (whereas the default ASCII encoding scheme did not, since it only includes characters numbered 0 through 127). -

                                          Remember I said Python usually converted unicode to ASCII whenever it needed to make a regular string out of a unicode string? Well, this default encoding scheme is an option which + which you pass as a parameter. In this case, you're using latin-1 (also known as iso-8859-1), which includes the tilde-n (whereas the default ASCII encoding scheme did not, since it only includes characters numbered 0 through 127). +

                                          Remember I said Python usually converted unicode to ASCII whenever it needed to make a regular string out of a unicode string? Well, this default encoding scheme is an option which you can customize.

                                          Example 9.15. sitecustomize.py

                                          
                                           # sitecustomize.py 
                                          @@ -4237,15 +4237,15 @@ La Peña

                                          If you are going to be storing non-ASCII strings within your Python code, you'll need to specify the encoding of each individual .py file by putting an encoding declaration at the top of each file. This declaration defines the .py file to be UTF-8:

                                          
                                           #!/usr/bin/env python
                                           # -*- coding: UTF-8 -*-
                                          -

                                          Now, what about XML? Well, every XML document is in a specific encoding. Again, ISO-8859-1 is a popular encoding for data in Western European languages. KOI8-R -is popular for Russian texts. The encoding, if specified, is in the header of the XML document. +

                                      5. Now, what about XML? Well, every XML document is in a specific encoding. Again, ISO-8859-1 is a popular encoding for data in Western European languages. KOI8-R +is popular for Russian texts. The encoding, if specified, is in the header of the XML document.

                                        Example 9.18. russiansample.xml

                                        
                                         <?xml version="1.0" encoding="koi8-r"?>       
                                         <preface>
                                         <title>Предисловие</title>  
                                         </preface>
                                          -
                                        1. This is a sample extract from a real Russian XML document; it's part of a Russian translation of this very book. Note the encoding, koi8-r, specified in the header. +
                                        2. This is a sample extract from a real Russian XML document; it's part of a Russian translation of this very book. Note the encoding, koi8-r, specified in the header.
                                        3. These are Cyrillic characters which, as far as I know, spell the Russian word for “Preface”. If you open this file in a regular text editor, the characters will most likely look like gibberish, because they're encoded using the koi8-r encoding scheme, but they're being displayed in iso-8859-1.

                                          Example 9.19. Parsing russiansample.xml

                                          @@ -4267,26 +4267,26 @@ UnicodeError: ASCII encoding error: ordinal not in range(128)
                                           
                                        4. I'm assuming here that you saved the previous example as russiansample.xml in the current directory. I am also, for the sake of completeness, assuming that you've changed your default encoding back to 'ascii' by removing your sitecustomize.py file, or at least commenting out the setdefaultencoding line.
                                        5. Note that the text data of the title tag (now in the title variable, thanks to that long concatenation of Python functions which I hastily skipped over and, annoyingly, won't explain until the next section) -- the text data inside the -XML document's title element is stored in unicode. -
                                        6. Printing the title is not possible, because this unicode string contains non-ASCII characters, so Python can't convert it to ASCII because that doesn't make sense. +XML document's title element is stored in unicode. +
                                        7. Printing the title is not possible, because this unicode string contains non-ASCII characters, so Python can't convert it to ASCII because that doesn't make sense.
                                        8. You can, however, explicitly convert it to koi8-r, in which case you get a (regular, not unicode) string of single-byte characters (f0, d2, c5, and so forth) that are the koi8-r-encoded versions of the characters in the original unicode string. -
                                        9. Printing the koi8-r-encoded string will probably show gibberish on your screen, because your Python IDE is interpreting those characters as iso-8859-1, not koi8-r. But at least they do print. (And, if you look carefully, it's the same gibberish that you saw when you opened the original -XML document in a non-unicode-aware text editor. Python converted it from koi8-r into unicode when it parsed the XML document, and you've just converted it back.) +
                                        10. Printing the koi8-r-encoded string will probably show gibberish on your screen, because your Python IDE is interpreting those characters as iso-8859-1, not koi8-r. But at least they do print. (And, if you look carefully, it's the same gibberish that you saw when you opened the original +XML document in a non-unicode-aware text editor. Python converted it from koi8-r into unicode when it parsed the XML document, and you've just converted it back.)

                                          To sum up, unicode itself is a bit intimidating if you've never seen it before, but unicode data is really very easy to handle -in Python. If your XML documents are all 7-bit ASCII (like the examples in this chapter), you will literally never think about unicode. Python will convert the ASCII data in the XML documents into unicode while parsing, and auto-coerce it back to ASCII whenever necessary, and you'll never even notice. But if you need to deal with that in other languages, Python is ready. +in Python. If your XML documents are all 7-bit ASCII (like the examples in this chapter), you will literally never think about unicode. Python will convert the ASCII data in the XML documents into unicode while parsing, and auto-coerce it back to ASCII whenever necessary, and you'll never even notice. But if you need to deal with that in other languages, Python is ready.

                                          Further reading

                                          • Unicode.org is the home page of the unicode standard, including a brief technical introduction. -
                                          • Unicode Tutorial has some more examples of how to use Python's unicode functions, including how to force Python to coerce unicode into ASCII even when it doesn't really want to. +
                                          • Unicode Tutorial has some more examples of how to use Python's unicode functions, including how to force Python to coerce unicode into ASCII even when it doesn't really want to.
                                          • PEP 263 goes into more detail about how and when to define a character encoding in your .py files.

                                          9.5. Searching for elements

                                          -

                                          Traversing XML documents by stepping through each node can be tedious. If you're looking for something in particular, buried deep within - your XML document, there is a shortcut you can use to find it quickly: getElementsByTagName. +

                                          Traversing XML documents by stepping through each node can be tedious. If you're looking for something in particular, buried deep within + your XML document, there is a shortcut you can use to find it quickly: getElementsByTagName.

                                          For this section, you'll be using the binary.xml grammar file, which looks like this:

                                          Example 9.20. binary.xml

                                          <?xml version="1.0"?>
                                           <!DOCTYPE grammar PUBLIC "-//diveintopython3.org//DTD Kant Generator Pro v1.0//EN" "kgp.dtd">
                                          @@ -4318,7 +4318,7 @@ in Python. If your XML documents are all 7-bit ASCII
                                           </ref>
                                           
                                            -
                                          1. getElementsByTagName takes one argument, the name of the element you wish to find. It returns a list of Element objects, corresponding to the XML elements that have that name. In this case, you find two ref elements. +
                                          2. getElementsByTagName takes one argument, the name of the element you wish to find. It returns a list of Element objects, corresponding to the XML elements that have that name. In this case, you find two ref elements.

                                            Example 9.22. Every element is searchable

                                             >>> firstref = reflist[0]    
                                             >>> print firstref.toxml()
                                            @@ -4349,15 +4349,15 @@ in Python. If your XML documents are all 7-bit ASCII
                                             '<p><xref id="bit"/><xref id="bit"/><xref id="bit"/><xref id="bit"/>\
                                             <xref id="bit"/><xref id="bit"/><xref id="bit"/><xref id="bit"/></p>'
                                              -
                                            1. Note carefully the difference between this and the previous example. Previously, you were searching for p elements within firstref, but here you are searching for p elements within xmldoc, the root-level object that represents the entire XML document. This does find the p elements nested within the ref elements within the root grammar element. +
                                            2. Note carefully the difference between this and the previous example. Previously, you were searching for p elements within firstref, but here you are searching for p elements within xmldoc, the root-level object that represents the entire XML document. This does find the p elements nested within the ref elements within the root grammar element.
                                            3. The first two p elements are within the first ref (the 'bit' ref).
                                            4. The last p element is the one within the second ref (the 'byte' ref).

                                              9.6. Accessing element attributes

                                              -

                                              XML elements can have one or more attributes, and it is incredibly simple to access them once you have parsed an XML document. +

                                              XML elements can have one or more attributes, and it is incredibly simple to access them once you have parsed an XML document.

                                              For this section, you'll be using the binary.xml grammar file that you saw in the previous section. -
                                              Note: This section may be a little confusing, because of some overlapping terminology. Elements in an XML document have attributes, and Python objects also have attributes. When you parse an XML document, you get a bunch of Python objects that represent all the pieces of the XML document, and some of these Python objects represent attributes of the XML elements. But the (Python) objects that represent the (XML) attributes also have (Python) attributes, which are used to access various parts of the (XML) attribute that the object represents. I told you it was confusing. I am open to suggestions on how to distinguish these +Note: This section may be a little confusing, because of some overlapping terminology. Elements in an XML document have attributes, and Python objects also have attributes. When you parse an XML document, you get a bunch of Python objects that represent all the pieces of the XML document, and some of these Python objects represent attributes of the XML elements. But the (Python) objects that represent the (XML) attributes also have (Python) attributes, which are used to access various parts of the (XML) attribute that the object represents. I told you it was confusing. I am open to suggestions on how to distinguish these more clearly.

                                              Example 9.24. Accessing element attributes

                                               >>> xmldoc = minidom.parse('binary.xml')
                                              @@ -4379,7 +4379,7 @@ in Python. If your XML documents are all 7-bit ASCII
                                               
                                              1. Each Element object has an attribute called attributes, which is a NamedNodeMap object. This sounds scary, but it's not, because a NamedNodeMap is an object that acts like a dictionary, so you already know how to use it.
                                              2. Treating the NamedNodeMap as a dictionary, you can get a list of the names of the attributes of this element by using attributes.keys(). This element has only one attribute, 'id'. -
                                              3. Attribute names, like all other text in an XML document, are stored in unicode. +
                                              4. Attribute names, like all other text in an XML document, are stored in unicode.
                                              5. Again treating the NamedNodeMap as a dictionary, you can get a list of the values of the attributes by using attributes.values(). The values are themselves objects, of type Attr. You'll see how to get useful information out of this object in the next example.
                                              6. Still treating the NamedNodeMap as a dictionary, you can access an individual attribute by name, using normal dictionary syntax. (Readers who have been paying extra-close attention will already know how the NamedNodeMap class accomplishes this neat trick: by defining a __getitem__ special method. Other readers can take comfort in the fact that they don't need to understand how it works in order to use it effectively.) @@ -4392,11 +4392,11 @@ u'id' >>> a.value u'bit'
                                                -
                                              1. The Attr object completely represents a single XML attribute of a single XML element. The name of the attribute (the same name as you used to find this object in the bitref.attributes NamedNodeMap pseudo-dictionary) is stored in a.name. -
                                              2. The actual text value of this XML attribute is stored in a.value. +
                                              3. The Attr object completely represents a single XML attribute of a single XML element. The name of the attribute (the same name as you used to find this object in the bitref.attributes NamedNodeMap pseudo-dictionary) is stored in a.name. +
                                              4. The actual text value of this XML attribute is stored in a.value. -
                                                Note: Like a dictionary, attributes of an XML element have no ordering. Attributes may happen to be listed in a certain order in the original XML document, and the Attr objects may happen to be listed in a certain order when the XML document is parsed into Python objects, but these orders are arbitrary and should carry no special meaning. You should always access individual attributes +Note: Like a dictionary, attributes of an XML element have no ordering. Attributes may happen to be listed in a certain order in the original XML document, and the Attr objects may happen to be listed in a certain order when the XML document is parsed into Python objects, but these orders are arbitrary and should carry no special meaning. You should always access individual attributes by name, like the keys of a dictionary.

                                                9.7. Segue

                                                OK, that's it for the hard-core XML stuff. The next chapter will continue to use these same example programs, but focus on @@ -4404,7 +4404,7 @@ u'bit'

                                                Before moving on to the next chapter, you should be comfortable doing all of these things:

                                                  -
                                                • Parsing XML documents using minidom, searching through the parsed document, and accessing arbitrary element attributes and element children +
                                                • Parsing XML documents using minidom, searching through the parsed document, and accessing arbitrary element attributes and element children
                                                • Organizing complex libraries into packages
                                                • Converting unicode strings to different character encodings @@ -4426,8 +4426,8 @@ off and returns the next chunk of data.

                                                  This is how reading from real files works; the difference is that you're not limiting yourself to real files. The input source could be anything: a file on disk, a web page, even a hard-coded string. As long as you pass a file-like object to the function, and the function simply calls the object's read method, the function can handle any kind of input source without specific code to handle each kind. -

                                                  In case you were wondering how this relates to XML processing, minidom.parse is one such function which can take a file-like object. -

                                                  Example 10.1. Parsing XML from a file

                                                  +

                                                  In case you were wondering how this relates to XML processing, minidom.parse is one such function which can take a file-like object. +

                                                  Example 10.1. Parsing XML from a file

                                                   >>> from xml.dom import minidom
                                                   >>> fsock = open('binary.xml')    
                                                   >>> xmldoc = minidom.parse(fsock) 
                                                  @@ -4446,12 +4446,12 @@ calls the object's read method, the function can handle any kind of
                                                   </grammar>
                                                  1. First, you open the file on disk. This gives you a file object. -
                                                  2. You pass the file object to minidom.parse, which calls the read method of fsock and reads the XML document from the file on disk. +
                                                  3. You pass the file object to minidom.parse, which calls the read method of fsock and reads the XML document from the file on disk.
                                                  4. Be sure to call the close method of the file object after you're done with it. minidom.parse will not do this for you. -
                                                  5. Calling the toxml() method on the returned XML document prints out the entire thing. +
                                                  6. Calling the toxml() method on the returned XML document prints out the entire thing.

                                                    Well, that all seems like a colossal waste of time. After all, you've already seen that minidom.parse can simply take the filename and do all the opening and closing nonsense automatically. And it's true that if you know you're -just going to be parsing a local file, you can pass the filename and minidom.parse is smart enough to Do The Right Thing™. But notice how similar -- and easy -- it is to parse an XML document straight from the Internet. -

                                                    Example 10.2. Parsing XML from a URL

                                                    +just going to be parsing a local file, you can pass the filename and minidom.parse is smart enough to Do The Right Thing™. But notice how similar -- and easy -- it is to parse an XML document straight from the Internet.
                                                    +

                                                    Example 10.2. Parsing XML from a URL

                                                     >>> import urllib
                                                     >>> usock = urllib.urlopen('http://slashdot.org/slashdot.rdf') 
                                                     >>> xmldoc = minidom.parse(usock)            
                                                    @@ -4480,20 +4480,20 @@ just going to be parsing a local file, you can pass the filename and minid
                                                     
                                                     [...snip...]
                                                      -
                                                    1. As you saw in a previous chapter, urlopen takes a web page URL and returns a file-like object. Most importantly, this object has a read method which returns the HTML source of the web page. -
                                                    2. Now you pass the file-like object to minidom.parse, which obediently calls the read method of the object and parses the XML data that the read method returns. The fact that this XML data is now coming straight from a web page is completely irrelevant. minidom.parse doesn't know about web pages, and it doesn't care about web pages; it just knows about file-like objects. +
                                                    3. As you saw in a previous chapter, urlopen takes a web page URL and returns a file-like object. Most importantly, this object has a read method which returns the HTML source of the web page. +
                                                    4. Now you pass the file-like object to minidom.parse, which obediently calls the read method of the object and parses the XML data that the read method returns. The fact that this XML data is now coming straight from a web page is completely irrelevant. minidom.parse doesn't know about web pages, and it doesn't care about web pages; it just knows about file-like objects.
                                                    5. As soon as you're done with it, be sure to close the file-like object that urlopen gives you. -
                                                    6. By the way, this URL is real, and it really is XML. It's an XML representation of the current headlines on Slashdot, a technical news and gossip site. -

                                                      Example 10.3. Parsing XML from a string (the easy but inflexible way)

                                                      +
                                                    7. By the way, this URL is real, and it really is XML. It's an XML representation of the current headlines on Slashdot, a technical news and gossip site. +

                                                      Example 10.3. Parsing XML from a string (the easy but inflexible way)

                                                       >>> contents = "<grammar><ref id='bit'><p>0</p><p>1</p></ref></grammar>"
                                                       >>> xmldoc = minidom.parseString(contents) 
                                                       >>> print xmldoc.toxml()
                                                       <?xml version="1.0" ?>
                                                       <grammar><ref id="bit"><p>0</p><p>1</p></ref></grammar>
                                                        -
                                                      1. minidom has a method, parseString, which takes an entire XML document as a string and parses it. You can use this instead of minidom.parse if you know you already have your entire XML document in a string. -

                                                        OK, so you can use the minidom.parse function for parsing both local files and remote URLs, but for parsing strings, you use... a different function. That means that if you want to be able to take input from a -file, a URL, or a string, you'll need special logic to check whether it's a string, and call the parseString function instead. How unsatisfying. +

                                                      2. minidom has a method, parseString, which takes an entire XML document as a string and parses it. You can use this instead of minidom.parse if you know you already have your entire XML document in a string. +

                                                        OK, so you can use the minidom.parse function for parsing both local files and remote URLs, but for parsing strings, you use... a different function. That means that if you want to be able to take input from a +file, a URL, or a string, you'll need special logic to check whether it's a string, and call the parseString function instead. How unsatisfying.

                                                        If there were a way to turn a string into a file-like object, then you could simply pass this object to minidom.parse. And in fact, there is a module specifically designed for doing just that: StringIO.

                                                        Example 10.4. Introducing StringIO

                                                         >>> contents = "<grammar><ref id='bit'><p>0</p><p>1</p></ref></grammar>"
                                                        @@ -4520,7 +4520,7 @@ file, a URL, or a string, you'll need special logic to check
                                                         
                                                      3. You can also read the string in chunks, by passing a size parameter to the read method.
                                                      4. At any time, read will return the rest of the string that you haven't read yet. All of this is exactly how file objects work; hence the term file-like object. -

                                                        Example 10.5. Parsing XML from a string (the file-like object way)

                                                        +

                                                        Example 10.5. Parsing XML from a string (the file-like object way)

                                                         >>> contents = "<grammar><ref id='bit'><p>0</p><p>1</p></ref></grammar>"
                                                         >>> ssock = StringIO.StringIO(contents)
                                                         >>> xmldoc = minidom.parse(ssock) 
                                                        @@ -4530,7 +4530,7 @@ file, a URL, or a string, you'll need special logic to check
                                                         <grammar><ref id="bit"><p>0</p><p>1</p></ref></grammar>
                                                        1. Now you can pass the file-like object (really a StringIO) to minidom.parse, which will call the object's read method and happily parse away, never knowing that its input came from a hard-coded string. -

                                                          So now you know how to use a single function, minidom.parse, to parse an XML document stored on a web page, in a local file, or in a hard-coded string. For a web page, you use urlopen to get a file-like object; for a local file, you use open; and for a string, you use StringIO. Now let's take it one step further and generalize these differences as well. +

                                                          So now you know how to use a single function, minidom.parse, to parse an XML document stored on a web page, in a local file, or in a hard-coded string. For a web page, you use urlopen to get a file-like object; for a local file, you use open; and for a string, you use StringIO. Now let's take it one step further and generalize these differences as well.

                                                          Example 10.6. openAnything

                                                          
                                                           def openAnything(source):
                                                               # try to open with urllib (if source is http, ftp, or file URL)
                                                          @@ -4550,12 +4550,12 @@ def openAnything(source):
                                                               import StringIO     
                                                               return StringIO.StringIO(str(source))  
                                                            -
                                                          1. The openAnything function takes a single parameter, source, and returns a file-like object. source is a string of some sort; it can either be a URL (like 'http://slashdot.org/slashdot.rdf'), a full or partial pathname to a local file (like 'binary.xml'), or a string that contains actual XML data to be parsed. -
                                                          2. First, you see if source is a URL. You do this through brute force: you try to open it as a URL and silently ignore errors caused by trying to open something which is not a URL. This is actually elegant in the sense that, if urllib ever supports new types of URLs in the future, you will also support them without recoding. If urllib is able to open source, then the return kicks you out of the function immediately and the following try statements never execute. -
                                                          3. On the other hand, if urllib yelled at you and told you that source wasn't a valid URL, you assume it's a path to a file on disk and try to open it. Again, you don't do anything fancy to check whether source is a valid filename or not (the rules for valid filenames vary wildly between different platforms anyway, so you'd probably +
                                                          4. The openAnything function takes a single parameter, source, and returns a file-like object. source is a string of some sort; it can either be a URL (like 'http://slashdot.org/slashdot.rdf'), a full or partial pathname to a local file (like 'binary.xml'), or a string that contains actual XML data to be parsed. +
                                                          5. First, you see if source is a URL. You do this through brute force: you try to open it as a URL and silently ignore errors caused by trying to open something which is not a URL. This is actually elegant in the sense that, if urllib ever supports new types of URLs in the future, you will also support them without recoding. If urllib is able to open source, then the return kicks you out of the function immediately and the following try statements never execute. +
                                                          6. On the other hand, if urllib yelled at you and told you that source wasn't a valid URL, you assume it's a path to a file on disk and try to open it. Again, you don't do anything fancy to check whether source is a valid filename or not (the rules for valid filenames vary wildly between different platforms anyway, so you'd probably get them wrong anyway). Instead, you just blindly open the file, and silently trap any errors.
                                                          7. By this point, you need to assume that source is a string that has hard-coded data in it (since nothing else worked), so you use StringIO to create a file-like object out of it and return that. (In fact, since you're using the str function, source doesn't even need to be a string; it could be any object, and you'll use its string representation, as defined by its __str__ special method.) -

                                                            Now you can use this openAnything function in conjunction with minidom.parse to make a function that takes a source that refers to an XML document somehow (either as a URL, or a local filename, or a hard-coded XML document in a string) and parses it. +

                                                            Now you can use this openAnything function in conjunction with minidom.parse to make a function that takes a source that refers to an XML document somehow (either as a URL, or a local filename, or a hard-coded XML document in a string) and parses it.

                                                            Example 10.7. Using openAnything in kgp.py

                                                            
                                                             class KantGenerator:
                                                                 def _load(self, source):
                                                            @@ -4563,11 +4563,11 @@ class KantGenerator:
                                                                     xmldoc = minidom.parse(sock).documentElement
                                                                     sock.close()
                                                                     return xmldoc

                                                            10.2. Standard input, output, and error

                                                            -

                                                            UNIX users are already familiar with the concept of standard input, standard output, and standard error. This section is for +

                                                            UNIX users are already familiar with the concept of standard input, standard output, and standard error. This section is for the rest of you. -

                                                            Standard output and standard error (commonly abbreviated stdout and stderr) are pipes that are built into every UNIX system. When you print something, it goes to the stdout pipe; when your program crashes and prints out debugging information (like a traceback in Python), it goes to the stderr pipe. Both of these pipes are ordinarily just connected to the terminal window where you are working, so when a program +

                                                            Standard output and standard error (commonly abbreviated stdout and stderr) are pipes that are built into every UNIX system. When you print something, it goes to the stdout pipe; when your program crashes and prints out debugging information (like a traceback in Python), it goes to the stderr pipe. Both of these pipes are ordinarily just connected to the terminal window where you are working, so when a program prints, you see the output, and when a program crashes, you see the debugging information. (If you're working on a system -with a window-based Python IDE, stdout and stderr default to your “Interactive Window”.) +with a window-based Python IDE, stdout and stderr default to your “Interactive Window”.)

                                                            Example 10.8. Introducing stdout and stderr

                                                             >>> for i in range(3):
                                                             ...    print 'Dive in'             
                                                            @@ -4584,7 +4584,7 @@ Dive inDive inDive in
                                                            1. As you saw in Example 6.9, “Simple Counters”, you can use Python's built-in range function to build simple counter loops that repeat something a set number of times.
                                                            2. stdout is a file-like object; calling its write function will print out whatever string you give it. In fact, this is what the print function really does; it adds a carriage return to the end of the string you're printing, and calls sys.stdout.write. -
                                                            3. In the simplest case, stdout and stderr send their output to the same place: the Python IDE (if you're in one), or the terminal (if you're running Python from the command line). Like stdout, stderr does not add carriage returns for you; if you want them, add them yourself. +
                                                            4. In the simplest case, stdout and stderr send their output to the same place: the Python IDE (if you're in one), or the terminal (if you're running Python from the command line). Like stdout, stderr does not add carriage returns for you; if you want them, add them yourself.

                                                              stdout and stderr are both file-like objects, like the ones you discussed in Section 10.1, “Abstracting input sources”, but they are both write-only. They have no read method, only write. Still, they are file-like objects, and you can assign any other file- or file-like object to them to redirect their output.

                                                              Example 10.9. Redirecting output

                                                               [you@localhost kgp]$ python stdout.py
                                                              @@ -4605,11 +4605,11 @@ sys.stdout = saveout 
                                                               fsock.close()        
                                                               
                                                                -
                                                              1. This will print to the IDE “Interactive Window” (or the terminal, if running the script from the command line). +
                                                              2. This will print to the IDE “Interactive Window” (or the terminal, if running the script from the command line).
                                                              3. Always save stdout before redirecting it, so you can set it back to normal later.
                                                              4. Open a file for writing. If the file doesn't exist, it will be created. If the file does exist, it will be overwritten.
                                                              5. Redirect all further output to the new file you just opened. -
                                                              6. This will be “printed” to the log file only; it will not be visible in the IDE window or on the screen. +
                                                              7. This will be “printed” to the log file only; it will not be visible in the IDE window or on the screen.
                                                              8. Set stdout back to the way it was before you mucked with it.
                                                              9. Close the log file.

                                                                Redirecting stderr works exactly the same way, using sys.stderr instead of sys.stdout. @@ -4645,7 +4645,7 @@ entering function

                                                                1. This shorthand syntax of the print statement can be used to write to any open file, or file-like object. In this case, you can redirect a single print statement to stderr without affecting subsequent print statements.

                                                                  Standard input, on the other hand, is a read-only file object, and it represents the data flowing into the program from some -previous program. This will likely not make much sense to classic Mac OS users, or even Windows users unless you were ever fluent on the MS-DOS command line. The way it works is that you can construct a chain of commands in a single line, so that one program's output +previous program. This will likely not make much sense to classic Mac OS users, or even Windows users unless you were ever fluent on the MS-DOS command line. The way it works is that you can construct a chain of commands in a single line, so that one program's output becomes the input for the next program in the chain. The first program simply outputs to standard output (without doing any special redirecting itself, just doing normal print statements or whatever), and the next program reads from standard input, and the operating system takes care of connecting one program's output to the next program's input. @@ -4693,7 +4693,7 @@ def openAnything(source):

                                                                  1. This is the openAnything function from toolbox.py, which you previously examined in Section 10.1, “Abstracting input sources”. All you've done is add three lines of code at the beginning of the function to check if the source is “-”; if so, you return sys.stdin. Really, that's it! Remember, stdin is a file-like object with a read method, so the rest of the code (in kgp.py, where you call openAnything) doesn't change a bit.

                                                                    10.3. Caching node lookups

                                                                    -

                                                                    kgp.py employs several tricks which may or may not be useful to you in your XML processing. The first one takes advantage of the consistent structure of the input documents to build a cache of nodes. +

                                                                    kgp.py employs several tricks which may or may not be useful to you in your XML processing. The first one takes advantage of the consistent structure of the input documents to build a cache of nodes.

                                                                    A grammar file defines a series of ref elements. Each ref contains one or more p elements, which can contain a lot of different things, including xrefs. Whenever you encounter an xref, you look for a corresponding ref element with the same id attribute, and choose one of the ref element's children and parse it. (You'll see how this random choice is made in the next section.)

                                                                    This is how you build up the grammar: define ref elements for the smallest pieces, then define ref elements which "include" the first ref elements by using xref, and so forth. Then you parse the "largest" reference and follow each xref, and eventually output real text. The text you output depends on the (random) decisions you make each time you fill in an xref, so the output is different each time. @@ -4708,14 +4708,14 @@ def openAnything(source):

                                                                  2. Start by creating an empty dictionary, self.refs.
                                                                  3. As you saw in Section 9.5, “Searching for elements”, getElementsByTagName returns a list of all the elements of a particular name. You easily can get a list of all the ref elements, then simply loop through that list.
                                                                  4. As you saw in Section 9.6, “Accessing element attributes”, you can access individual attributes of an element by name, using standard dictionary syntax. So the keys of the self.refs dictionary will be the values of the id attribute of each ref element. -
                                                                  5. The values of the self.refs dictionary will be the ref elements themselves. As you saw in Section 9.3, “Parsing XML”, each element, each node, each comment, each piece of text in a parsed XML document is an object. +
                                                                  6. The values of the self.refs dictionary will be the ref elements themselves. As you saw in Section 9.3, “Parsing XML”, each element, each node, each comment, each piece of text in a parsed XML document is an object.

                                                                    Once you build this cache, whenever you come across an xref and need to find the ref element with the same id attribute, you can simply look it up in self.refs.

                                                                    Example 10.15. Using the ref element cache

                                                                    
                                                                         def do_xref(self, node):
                                                                             id = node.attributes["id"].value
                                                                             self.parse(self.randomChildElement(self.refs[id]))

                                                                    You'll explore the randomChildElement function in the next section.

                                                                    10.4. Finding direct children of a node

                                                                    -

                                                                    Another useful technique when parsing XML documents is finding all the direct child elements of a particular element. For instance, in the grammar files, a ref element can have several p elements, each of which can contain many things, including other p elements. You want to find just the p elements that are children of the ref, not p elements that are children of other p elements. +

                                                                    Another useful technique when parsing XML documents is finding all the direct child elements of a particular element. For instance, in the grammar files, a ref element can have several p elements, each of which can contain many things, including other p elements. You want to find just the p elements that are children of the ref, not p elements that are children of other p elements.

                                                                    You might think you could simply use getElementsByTagName for this, but you can't. getElementsByTagName searches recursively and returns a single list for all the elements it finds. Since p elements can contain other p elements, you can't use getElementsByTagName, because it would return nested p elements that you don't want. To find only direct child elements, you'll need to do it yourself.

                                                                    Example 10.16. Finding direct child elements

                                                                    
                                                                         def randomChildElement(self, node):
                                                                    @@ -4731,8 +4731,8 @@ def openAnything(source):
                                                                                 those nodes whose nodeType is ELEMENT_NODE.
                                                                     
                                                                  7. Once you have a list of actual elements, choosing a random one is easy. Python comes with a module called random which includes several useful functions. The random.choice function takes a list of any number of items and returns a random item. For example, if the ref element contains several p elements, then choices would be a list of p elements, and chosen would end up being assigned exactly one of them, selected at random.

                                                                    10.5. Creating separate handlers by node type

                                                                    -

                                                                    The third useful XML processing tip involves separating your code into logical functions, based on node types and element names. Parsed XML documents are made up of various types of nodes, each represented by a Python object. The root level of the document itself is represented by a Document object. The Document then contains one or more Element objects (for actual XML tags), each of which may contain other Element objects, Text objects (for bits of text), or Comment objects (for embedded comments). Python makes it easy to write a dispatcher to separate the logic for each node type. -

                                                                    Example 10.17. Class names of parsed XML objects

                                                                    +

                                                                    The third useful XML processing tip involves separating your code into logical functions, based on node types and element names. Parsed XML documents are made up of various types of nodes, each represented by a Python object. The root level of the document itself is represented by a Document object. The Document then contains one or more Element objects (for actual XML tags), each of which may contain other Element objects, Text objects (for bits of text), or Comment objects (for embedded comments). Python makes it easy to write a dispatcher to separate the logic for each node type. +

                                                                    Example 10.17. Class names of parsed XML objects

                                                                     >>> from xml.dom import minidom
                                                                     >>> xmldoc = minidom.parse('kant.xml') 
                                                                     >>> xmldoc
                                                                    @@ -4743,11 +4743,11 @@ def openAnything(source):
                                                                     'Document'
                                                                    1. Assume for a moment that kant.xml is in the current directory. -
                                                                    2. As you saw in Section 9.2, “Packages”, the object returned by parsing an XML document is a Document object, as defined in the minidom.py in the xml.dom package. As you saw in Section 5.4, “Instantiating Classes”, __class__ is built-in attribute of every Python object. +
                                                                    3. As you saw in Section 9.2, “Packages”, the object returned by parsing an XML document is a Document object, as defined in the minidom.py in the xml.dom package. As you saw in Section 5.4, “Instantiating Classes”, __class__ is a built-in attribute of every Python object.
                                                                    4. Furthermore, __name__ is a built-in attribute of every Python class, and it is a string. This string is not mysterious; it's the same as the class name you type when you define a class yourself. (See Section 5.3, “Defining Classes”.) -

                                                                      Fine, so now you can get the class name of any particular XML node (since each XML node is represented as a Python object). How can you use this to your advantage to separate the logic of parsing each node type? The answer is getattr, which you first saw in Section 4.4, “Getting Object References With getattr”. -

                                                                      Example 10.18. parse, a generic XML node dispatcher

                                                                      
                                                                      +

                                                                      Fine, so now you can get the class name of any particular XML node (since each XML node is represented as a Python object). How can you use this to your advantage to separate the logic of parsing each node type? The answer is getattr, which you first saw in Section 4.4, “Getting Object References With getattr”. +

                                                                      Example 10.18. parse, a generic XML node dispatcher

                                                                      
                                                                           def parse(self, node):          
                                                                               parseMethod = getattr(self, "parse_%s" % node.__class__.__name__)  
                                                                               parseMethod(node) 
                                                                      @@ -4775,7 +4775,7 @@ def openAnything(source): handlerMethod = getattr(self, "do_%s" % node.tagName) handlerMethod(node)
                                                                        -
                                                                      1. parse_Document is only ever called once, since there is only one Document node in an XML document, and only one Document object in the parsed XML representation. It simply turns around and parses the root element of the grammar file. +
                                                                      2. parse_Document is only ever called once, since there is only one Document node in an XML document, and only one Document object in the parsed XML representation. It simply turns around and parses the root element of the grammar file.
                                                                      3. parse_Text is called on nodes that represent bits of text. The function itself does some special processing to handle automatic capitalization of the first word of a sentence, but otherwise simply appends the represented text to a list.
                                                                      4. parse_Comment is just a pass, since you don't care about embedded comments in the grammar files. Note, however, that you still need to define the function @@ -4789,7 +4789,7 @@ you could break up your code into separate modules, and use dynamic importing to you needed. Dynamic importing will be discussed in Chapter 16, Functional Programming.

                                                                        10.6. Handling command-line arguments

                                                                        Python fully supports creating programs that can be run on the command line, complete with command-line arguments and either short- - or long-style flags to specify various options. None of this is XML-specific, but this script makes good use of command-line processing, so it seemed like a good time to mention it. + or long-style flags to specify various options. None of this is XML-specific, but this script makes good use of command-line processing, so it seemed like a good time to mention it.

                                                                        It's difficult to talk about command-line processing without understanding how command-line arguments are exposed to your Python program, so let's write a simple program to see them.

                                                                        Example 10.20. Introducing sys.argv

                                                                        @@ -4846,7 +4846,7 @@ if __name__ == "__main__": command-line flags that are equivalent to the single-character versions. This is quite confusing at first glance, and is explained in more detail below.
                                                                      5. If anything goes wrong trying to parse these command-line flags, getopt will raise an exception, which you catch. You told getopt all the flags you understand, so this probably means that the end user passed some command-line flag that you don't understand. -
                                                                      6. As is standard practice in the UNIX world, when the script is passed flags it doesn't understand, you print out a summary of proper usage and exit gracefully. +
                                                                      7. As is standard practice in the UNIX world, when the script is passed flags it doesn't understand, you print out a summary of proper usage and exit gracefully. Note that I haven't shown the usage function here. You would still need to code that somewhere and have it print out the appropriate summary; it's not automatic.

                                                                        So what are all those parameters you pass to the getopt function? Well, the first one is simply the raw list of command-line flags and arguments (not including the first element, the script name, which you already chopped off before calling the main function). The second is the list of short command-line flags that the script accepts. @@ -4932,12 +4932,12 @@ def main(argv): for opt, arg in opts: ...

                                                                    You create a new instance of the KantGenerator class, and pass it the grammar file and source that may or may not have been specified on the command line.

                                                                    
                                                                    -    k = KantGenerator(grammar, source)

                                                                    The KantGenerator instance automatically loads the grammar, which is an XML file. You use your custom openAnything function to open the file (which could be stored in a local file or a remote web server), then use the built-in minidom parsing functions to parse the XML into a tree of Python objects. + k = KantGenerator(grammar, source)

                                                                  8. The KantGenerator instance automatically loads the grammar, which is an XML file. You use your custom openAnything function to open the file (which could be stored in a local file or a remote web server), then use the built-in minidom parsing functions to parse the XML into a tree of Python objects.

                                                                    
                                                                         def _load(self, source):
                                                                             sock = toolbox.openAnything(source)
                                                                             xmldoc = minidom.parse(sock).documentElement
                                                                    -        sock.close()

                                                                    Oh, and along the way, you take advantage of your knowledge of the structure of the XML document to set up a little cache of references, which are just elements in the XML document. + sock.close()

                                                        Oh, and along the way, you take advantage of your knowledge of the structure of the XML document to set up a little cache of references, which are just elements in the XML document.

                                                        
                                                             def loadGrammar(self, grammar):       
                                                                 for ref in self.grammar.getElementsByTagName("ref"):
                                                        @@ -4950,7 +4950,7 @@ the "top-level" reference (that isn't referenced by anything else) and use that
                                                                     xrefs[xref.attributes["id"].value] = 1
                                                                 xrefs = xrefs.keys()
                                                                 standaloneXrefs = [e for e in self.refs.keys() if e not in xrefs]
                                                        -        return '<xref id="%s"/>' % random.choice(standaloneXrefs)

                                                        Now you rip through the source material. The source material is also XML, and you parse it one node at a time. To keep the code separated and more maintainable, you use separate handlers for each node type. + return '<xref id="%s"/>' % random.choice(standaloneXrefs)

                                                      5. Now you rip through the source material. The source material is also XML, and you parse it one node at a time. To keep the code separated and more maintainable, you use separate handlers for each node type.

                                                        
                                                             def parse_Element(self, node): 
                                                                 handlerMethod = getattr(self, "do_%s" % node.tagName)
                                                        @@ -4977,7 +4977,7 @@ def main(argv):
                                                         ...
                                                             k = KantGenerator(grammar, source)
                                                             print k.output()

                                                        10.8. Summary

                                                        -

                                                        Python comes with powerful libraries for parsing and manipulating XML documents. The minidom takes an XML file and parses it into Python objects, providing for random access to arbitrary elements. Furthermore, this chapter shows how Python can be used to create a "real" standalone command-line script, complete with command-line flags, command-line arguments, +

                                                        Python comes with powerful libraries for parsing and manipulating XML documents. The minidom takes an XML file and parses it into Python objects, providing for random access to arbitrary elements. Furthermore, this chapter shows how Python can be used to create a "real" standalone command-line script, complete with command-line flags, command-line arguments, error handling, even the ability to take input from the piped result of a previous program.

                                                        Before moving on to the next chapter, you should be comfortable doing all of these things:

                                                        @@ -5226,7 +5226,7 @@ header: Connection: close
                                                      6. Now that the debugging flag is set, information on the HTTP request and response is printed out in real time. The first thing it tells you is that you're connecting to the server diveintomark.org on port 80, which is the standard port for HTTP.
                                                      7. When you request the Atom feed, urllib sends three lines to the server. The first line specifies the HTTP verb you're using, and the path of the resource (minus - the domain name). All the requests in this chapter will use GET, but in the next chapter on SOAP, you'll see that it uses POST for everything. The basic syntax is the same, regardless of the verb. + the domain name). All the requests in this chapter will use GET, but in the next chapter on SOAP, you'll see that it uses POST for everything. The basic syntax is the same, regardless of the verb.
                                                      8. The second line is the Host header, which specifies the domain name of the service you're accessing. This is important, because a single HTTP server can host multiple separate domains. My server currently hosts 12 domains; other servers can host hundreds or even thousands.
                                                      9. The third line is the User-Agent header. What you see here is the generic User-Agent that the urllib library adds by default. In the next section, you'll see how to customize this to be more specific. @@ -5263,7 +5263,7 @@ header: Content-Length: 26848 header: Connection: close
                                                      1. -
                                                      2. If you still have your Python IDE open from the previous section's example, you can skip this, but this turns on HTTP debugging so you can see what you're actually sending over the wire, and what gets sent back. +
                                                      3. If you still have your Python IDE open from the previous section's example, you can skip this, but this turns on HTTP debugging so you can see what you're actually sending over the wire, and what gets sent back.
                                                      4. Fetching an HTTP resource with urllib2 is a three-step process, for good reasons that will become clear shortly. The first step is to create a Request object, which takes the URL of the resource you'll eventually get around to retrieving. Note that this step doesn't actually retrieve anything yet.
                                                      5. The second step is to build a URL opener. This can take any number of handlers, which control how responses are handled. @@ -5387,7 +5387,7 @@ class DefaultErrorHandler(urllib2.HTTPDefaultErrorHandler): ①Now you can quietly open the resource, and what you get back is an object that, along with the usual headers (use seconddatastream.headers.dict to access them), also contains the HTTP status code. In this case, as you expected, the status is 304, meaning this data hasn't changed since the last time you asked for it.
                                                      6. Note that when the server sends back a 304 status code, it doesn't re-send the data. That's the whole point: to save bandwidth by not re-downloading data that hasn't changed. So if you actually want that data, you'll need to cache it locally the first time you get it. -

                                                        Handling ETag works much the same way, but instead of checking for Last-Modified and sending If-Modified-Since, you check for ETag and send If-None-Match. Let's start with a fresh IDE session. +

                                                        Handling ETag works much the same way, but instead of checking for Last-Modified and sending If-Modified-Since, you check for ETag and send If-None-Match. Let's start with a fresh IDE session.

                                                        Example 11.9. Supporting ETag/If-None-Match

                                                         >>> import urllib2, openanything
                                                         >>> request = urllib2.Request('http://diveintomark.org/xml/atom.xml')
                                                        @@ -5812,7 +5812,7 @@ numerals. You saw the mechanics of constructing and validating Roman numerals in
                                                         
                                                        1. There is only one correct way to represent a particular number as Roman numerals. -
                                                        2. The converse is also true: if a string of characters is a valid Roman numeral, it represents only one number (i.e. it can only be read one way). +
                                                        3. The converse is also true: if a string of characters is a valid Roman numeral, it represents only one number (i.e. it can only be read one way).
                                                        4. There is a limited range of numbers that can be expressed as Roman numerals, specifically 1 through 3999. (The Romans did have several ways of expressing larger numbers, for instance by having a bar over a numeral to represent that its normal value should be multiplied by 1000, but you're not going to deal with that. For the purposes of this chapter, let's stipulate that Roman numerals go from 1 to 3999.) @@ -5840,7 +5840,7 @@ numerals. You saw the mechanics of constructing and validating Roman numerals in
                                                        5. to_roman() should always return a Roman numeral using uppercase letters. -
                                                        6. from_roman() should only accept uppercase Roman numerals (i.e. it should fail when given lowercase input). +
                                                        7. from_roman() should only accept uppercase Roman numerals (i.e. it should fail when given lowercase input).
                                                        @@ -5942,7 +5942,7 @@ class SanityCheck(unittest.TestCase):
                                                        1. to_roman() should always return a Roman numeral using uppercase letters. -
                                                        2. from_roman() should only accept uppercase Roman numerals (i.e. it should fail when given lowercase input). +
                                                        3. from_roman() should only accept uppercase Roman numerals (i.e. it should fail when given lowercase input).

                                                        In fact, they are somewhat arbitrary. You could, for instance, have stipulated that from_roman() accept lowercase and mixed case input. But they are not completely arbitrary; if to_roman() is always returning uppercase output, then from_roman() must at least accept uppercase input, or the “sanity check” (requirement #6) would fail. The fact that it only accepts uppercase input is arbitrary, but as any systems integrator will tell you, case always matters, so it's worth specifying @@ -6012,10 +6012,10 @@ def from_roman(s): have inherited each individual exception from the Exception class directly.

                                                      7. The OutOfRangeError and NotIntegerError exceptions will eventually be used by to_roman() to flag various forms of invalid input, as specified in ToRomanBadInput.
                                                      8. The InvalidRomanNumeralError exception will eventually be used by from_roman() to flag invalid input, as specified in FromRomanBadInput. -
                                                      9. At this stage, you want to define the API of each of your functions, but you don't want to code them yet, so you stub them out using the Python reserved word pass. +
                                                      10. At this stage, you want to define the API of each of your functions, but you don't want to code them yet, so you stub them out using the Python reserved word pass.

                                                        Now for the big moment (drum roll please): you're finally going to run the unit test against this stubby little module. At this point, every test case should fail. In fact, if any test case passes in stage 1, you should go back to romantest.py and re-evaluate why you coded a test so useless that it passes with do-nothing functions. -

                                                      11. At this stage, you want to define the API of each of your functions, but you don't want to code them yet, so you stub them out using the Python reserved word pass. +
                                                      12. At this stage, you want to define the API of each of your functions, but you don't want to code them yet, so you stub them out using the Python reserved word pass.

                                                        Run romantest1.py with the -v command-line option, which will give more verbose output so you can see exactly what's going on as each test case runs. With any luck, your output should look like this:

                                                        Example 14.2. Output of romantest1.py against roman1.py

                                                        from_roman should only accept uppercase input ... ERROR
                                                        diff --git a/dip3.css b/dip3.css
                                                        index 0abc379..1ddd8cc 100644
                                                        --- a/dip3.css
                                                        +++ b/dip3.css
                                                        @@ -27,15 +27,19 @@ a:visited{color:darkorchid}
                                                         .skip a:active,.skip a:focus{position:static;width:auto;height:auto}
                                                         
                                                         /* code blocks */
                                                        -pre{white-space:pre-wrap;padding-left:2.154em;line-height:2.154;border-left:1px dotted}
                                                        +pre{white-space:pre-wrap;padding-left:2.154em;line-height:2.154;border-left:1px solid gainsboro}
                                                         .widgets{float:left}
                                                         .widgets,.widgets a,.download{font-size:small;line-height:2.154}
                                                        -.block,ol{clear:left}
                                                        +.block,ol,p,blockquote{clear:left}
                                                         pre a,.widgets a{padding:0.4375em 0;border:0}
                                                         .widgets a{text-decoration:underline}
                                                         pre a:hover{border:0}
                                                         kbd{font-weight:bold}
                                                         .prompt{color:#667}
                                                        +ins,del,mark{text-decoration:none;font-style:normal;display:inline-block;width:100%;line-height:2.154}
                                                        +del{background:salmon}
                                                        +ins{background:palegreen}
                                                        +mark{background:#ffff80}
                                                         
                                                         /* tables */
                                                         table{width:100%;border-collapse:collapse}
                                                        @@ -45,7 +49,7 @@ td{vertical-align:top}
                                                         th:first-child{width:10%;text-align:center}
                                                         .simple th{font-family:inherit !important}
                                                         .hover{background:#eee;color:inherit;cursor:default}
                                                        -td pre{margin:0;padding:0;border:0}
                                                        +td pre{margin:0;padding:0;border:0;background:inherit}
                                                         
                                                         /* headers */
                                                         h1,h2,h3,p,ul,ol{margin:1.75em 0;font-size:medium}
                                                        @@ -57,3 +61,6 @@ h1{counter-reset:h2}
                                                         h2:before{counter-increment:h2;content:counter(h1) "." counter(h2) ". "}
                                                         h2{counter-reset:h3}
                                                         h3:before{counter-increment:h3;content:counter(h1) "." counter(h2) "." counter(h3) ". "}
                                                        +
                                                        +/* HTML 5 support */
                                                        +article,aside,dialog,footer,header,section{display:block}
                                                        \ No newline at end of file
                                                        diff --git a/index.html b/index.html
                                                        index ff4e168..505a0ad 100644
                                                        --- a/index.html
                                                        +++ b/index.html
                                                        @@ -3,6 +3,7 @@
                                                         
                                                         
                                                         Dive Into Python 3
                                                        +
                                                         
                                                         
                                                         
                                                        @@ -10,32 +11,33 @@
                                                         .first{clear:both;margin-top:0;padding-top:1.75em}
                                                         li:last-child{list-style:none;margin:0 0 0 -1.7em}
                                                         li:last-child:before{content:"A. \00a0 \00a0"}
                                                        +li.todo{background:white;color:gainsboro}
                                                         
                                                         
                                                         
                                                         

                                                        Dive Into Python 3 will cover Python 3 and its differences from Python 2. Compared to the original Dive Into Python, it will be about 50% revised and 50% new material. I will publish drafts online as I go. The final version will be published on paper by Apress. The book will remain online under the CC-BY-SA-3.0 license.

                                                        You can see the full table of contents (not finalized), or read what I’ve written so far:

                                                          -
                                                        1. +
                                                        2. Installing Python
                                                        3. Your first Python program
                                                        4. Native datatypes -
                                                        5. +
                                                        6. Strings
                                                        7. Regular expressions -
                                                        8. -
                                                        9. +
                                                        10. The power of introspection +
                                                        11. Objects and object-orientation
                                                        12. Unit testing -
                                                        13. -
                                                        14. -
                                                        15. -
                                                        16. -
                                                        17. -
                                                        18. -
                                                        19. -
                                                        20. -
                                                        21. -
                                                        22. -
                                                        23. -
                                                        24. +
                                                        25. Test-first programming +
                                                        26. Refactoring your code +
                                                        27. Files +
                                                        28. HTML processing +
                                                        29. XML processing +
                                                        30. Web services +
                                                        31. Dynamic functions +
                                                        32. Metaclasses +
                                                        33. Performance tuning +
                                                        34. Packaging Python libraries +
                                                        35. Creating graphics with the Python Imaging Library +
                                                        36. Where to go from here
                                                        37. Case study: porting chardet to Python 3
                                                        38. Porting code to Python 3 with 2to3
                                                        diff --git a/native-datatypes.html b/native-datatypes.html index 96b52e6..884e6ac 100644 --- a/native-datatypes.html +++ b/native-datatypes.html @@ -3,6 +3,7 @@ Native datatypes - Dive into Python 3 + diff --git a/porting-code-to-python-3-with-2to3.html b/porting-code-to-python-3-with-2to3.html index cbd8d49..15e7f17 100644 --- a/porting-code-to-python-3-with-2to3.html +++ b/porting-code-to-python-3-with-2to3.html @@ -3,6 +3,7 @@ Porting code to Python 3 with 2to3 - Dive into Python 3 + diff --git a/publish b/publish index 3c929c6..b8d585f 100644 --- a/publish +++ b/publish @@ -14,8 +14,6 @@ sed -i -e "s|//}.; /\* google\..*|});|g" build/dip3.js revision=`hg log|grep changeset|cut -d":" -f3|head -1` java -jar yuicompressor-2.4.2.jar build/dip3.js > build/dip3.$revision.min.js java -jar yuicompressor-2.4.2.jar build/dip3.css > build/dip3.$revision.min.css -#rm build/dip3.js -#rm build/dip3.css sed -i -e "s|dip3\.js|http://wearehugh.com/dip3/dip3.${revision}.min.js|g" build/*.html sed -i -e "s|dip3\.css|http://wearehugh.com/dip3/dip3.${revision}.min.css|g" build/*.html @@ -23,5 +21,5 @@ sed -i -e "s|dip3\.css|http://wearehugh.com/dip3/dip3.${revision}.min.css|g" bui chmod 644 build/*.html build/*.css build/*.js build/*.py build/*.txt build/.htaccess # and push to production -rsync -essh -avzP --delete --delete-after build/*.min.css build/*.min.js diveintomark.org:~/web/wearehugh.com/dip3/ +rsync -essh -avzP --delete --delete-after build/*.min.css build/*.min.js build/html5.js diveintomark.org:~/web/wearehugh.com/dip3/ rsync -essh -avzP build/*.html build/*.py build/*.txt build/.htaccess diveintomark.org:~/web/diveintopython3.org/ diff --git a/regular-expressions.html b/regular-expressions.html index 776d247..b55fe31 100644 --- a/regular-expressions.html +++ b/regular-expressions.html @@ -3,6 +3,7 @@ Regular expressions - Dive into Python 3 + diff --git a/table-of-contents.html b/table-of-contents.html index 
c8d668c..87bbccf 100644 --- a/table-of-contents.html +++ b/table-of-contents.html @@ -3,6 +3,7 @@ Table of contents - Dive Into Python 3 + diff --git a/unit-testing.html b/unit-testing.html index 758154b..94ba851 100644 --- a/unit-testing.html +++ b/unit-testing.html @@ -3,6 +3,7 @@ Unit testing - Dive into Python 3 + @@ -134,7 +135,7 @@ if __name__ == "__main__":
                                                      13. To write a test case, first subclass the TestCase class of the unittest module. This class provides many useful methods which you can use in your test case to test specific conditions.
                                                      14. This is a list of integer/numeral pairs that I verified manually. It includes the lowest ten numbers, the highest number, every number that translates to a single-character Roman numeral, and a random sampling of other valid numbers. The point of a unit test is not to test every possible input, but to test a representative sample.
                                                      15. Every individual test is its own method, which must take no parameters and return no value. If the method exits normally without raising an exception, the test is considered passed; if the method raises an exception, the test is considered failed. -
                                                      16. Here you call the actual to_roman() function. (Well, the function hasn't been written yet, but once it is, this is the line that will call it.) Notice that you have now defined the API for the to_roman() function: it must take an integer (the number to convert) and return a string (the Roman numeral representation). If the API is different than that, this test is considered failed. Also notice that you are not trapping any exceptions when you call to_roman(). This is intentional. to_roman() shouldn't raise an exception when you call it with valid input, and these input values are all valid. If to_roman() raises an exception, this test is considered failed. +
                                                      17. Here you call the actual to_roman() function. (Well, the function hasn't been written yet, but once it is, this is the line that will call it.) Notice that you have now defined the API for the to_roman() function: it must take an integer (the number to convert) and return a string (the Roman numeral representation). If the API is different than that, this test is considered failed. Also notice that you are not trapping any exceptions when you call to_roman(). This is intentional. to_roman() shouldn't raise an exception when you call it with valid input, and these input values are all valid. If to_roman() raises an exception, this test is considered failed.
                                                      18. Assuming the to_roman() function was defined correctly, called correctly, completed successfully, and returned a value, the last step is to check whether it returned the right value. This is a common question, and the TestCase class provides a method, assertEqual, to check whether two values are equal. If the result returned from to_roman() (result) does not match the known value you were expecting (numeral), assertEqual will raise an exception and the test will fail. If the two values are equal, assertEqual will do nothing. If every value returned from to_roman() matches the known value you expect, assertEqual never raises an exception, so testToRomanKnownValues eventually exits normally, which means to_roman() has passed this test.

                                                      Once you have a test case, you can start coding the to_roman() function. First, you should stub it out as an empty function and make sure the tests fail. If the tests succeed before you've written any code, you're doing it wrong — your tests aren't testing your code at all! Write a test that fails, then code until it passes. @@ -144,7 +145,7 @@ function to_roman(n): """convert integer to Roman numeral""" pass

                                                      -
                                                    1. At this stage, you want to define the API of the to_roman() function, but you don't want to code it yet. (Your test needs to fail first.) To stub it out, use the Python reserved word pass [FIXME ref], which does precisely nothing.. +
                                                    2. At this stage, you want to define the API of the to_roman() function, but you don't want to code it yet. (Your test needs to fail first.) To stub it out, use the Python reserved word pass [FIXME ref], which does precisely nothing.

                                                    Execute romantest1.py on the command line to run the test. If you call it with the -v command-line option, it will give more verbose output so you can see exactly what's going on as each test case runs. With any luck, your output should look like this:

                                                    @@ -289,7 +290,7 @@ FAILED (errors=1)

                                                    Now run the test suite again.

                                                     you@localhost:~$ python3 romantest2.py -v
                                                    -to_roman should give known result with known input ... ok
                                                    +to_roman should give known result with known input ... ok
                                                     to_roman should fail with large input ... FAIL                          
                                                     
                                                     ======================================================================
                                                    @@ -360,7 +361,7 @@ For instance, the testFromRomanCase method (“from_roman
                                                     
                                                  7. If you take a number, convert it to Roman numerals, then convert that back to a number, you should end up with the number you started with. So from_roman(to_roman(n)) == n for all n in 1..3999.
                                                  8. to_roman should always return a Roman numeral using uppercase letters. -
                                                  9. from_roman should only accept uppercase Roman numerals (i.e. it should fail when given lowercase input). +
                                                  10. from_roman should only accept uppercase Roman numerals (i.e. it should fail when given lowercase input).
                                                  -->

                                                  © 2001–4, 2009 Mark Pilgrim, CC-BY-SA-3.0 diff --git a/your-first-python-program.html b/your-first-python-program.html index f2ef8ac..9513797 100644 --- a/your-first-python-program.html +++ b/your-first-python-program.html @@ -3,6 +3,7 @@ Your first Python program - Dive into Python 3 + @@ -42,6 +43,7 @@ body{counter-reset:h1 1}

                                                  Books about programming usually start with a bunch of boring chapters about fundamentals and eventually work up to building something useful. Let's skip all that. Here is a complete, working Python program. It probably makes absolutely no sense to you. Don't worry about that, because you're going to dissect it line by line. But read through it first and see what, if anything, you can make of it.

                                                  [The code examples will be easier to follow if you enable Javascript, but whatever.]

                                                  [download humansize.py]

                                                  +

                                                  skip over this code listing

                                                  SUFFIXES = {1000: ['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'],
                                                               1024: ['KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']}
                                                   
                                                  @@ -70,7 +72,8 @@ def approximate_size(size, a_kilobyte_is_1024_bytes=True):
                                                   if __name__ == "__main__":
                                                       print(approximate_size(1000000000000, False))
                                                       print(approximate_size(1000000000000))
                                                  -

                                                  Now let's run this program on the command line. On Windows, it will look something like this: +

                                                  Now let's run this program on the command line. On Windows, it will look something like this: +

                                                  skip over this command output listing

                                                   c:\home\diveintopython3> c:\python30\python.exe humansize.py
                                                   1.0 TB
                                                  @@ -80,7 +83,7 @@ if __name__ == "__main__":
                                                   you@localhost:~$ python3 humansize.py
                                                   1.0 TB
                                                   931.3 GiB
                                                  - +

                                                  FIXME: this would be a good place to explain what the program, you know, actually does.

                                                  Declaring functions

                                                  Python has functions like most other languages, but it does not have separate header files like C++ or interface/implementation sections like Pascal. When you need a function, just declare it, like this:

                                                  def approximate_size(size, a_kilobyte_is_1024_bytes=True):
                                                  @@ -120,6 +123,7 @@ if __name__ == "__main__":

                                                  I won't bore you with a long finger-wagging speech about the importance of documenting your code. Just know that code is written once but read many times, and the most important audience for your code is yourself, six months after writing it (i.e. after you've forgotten everything but need to fix something). Python makes it easy to write readable code, so take advantage of it. You'll thank me in six months.

                                                  Documentation strings

                                                  You can document a Python function by giving it a documentation string (docstring for short). In this program, the approximate_size function has a docstring: +

                                                  skip over this code listing

                                                  def approximate_size(size, a_kilobyte_is_1024_bytes=True):
                                                       """Convert a file size to human-readable form.
                                                   
                                                  @@ -131,7 +135,7 @@ if __name__ == "__main__":
                                                       Returns: string
                                                   
                                                       """
                                                  -

                                                  Triple quotes signify a multi-line string. Everything between the start and end quotes is part of a single string, including carriage returns, leading white space, and other quote characters. You can use them anywhere, but you'll see them most often used when defining a docstring. +

                                                  Triple quotes signify a multi-line string. Everything between the start and end quotes is part of a single string, including carriage returns, leading white space, and other quote characters. You can use them anywhere, but you'll see them most often used when defining a docstring.

                                                  Triple quotes are also an easy way to define a string with both single and double quotes, like qq/.../ in Perl 5.

                                                  @@ -146,6 +150,7 @@ if __name__ == "__main__":

                                                  Everything is an object

                                                  In case you missed it, I just said that Python functions have attributes, and that those attributes are available at runtime. A function, like everything else in Python, is an object.

                                                  Run the interactive Python shell and follow along: +

                                                  skip over this interpreter listing

                                                   >>> import humansize                               
                                                   >>> print(humansize.approximate_size(4096, True))  
                                                  @@ -161,7 +166,7 @@ if __name__ == "__main__":
                                                       Returns: string
                                                   
                                                   
                                                  -
                                                    +
                                                    1. The first line imports the humansize program as a module -- a chunk of code that you can use interactively, or from a larger Python program. (You'll see examples of multi-module Python programs in [FIXME xref].) Once you import a module, you can reference any of its public functions, classes, or attributes. Modules can do this to access functionality in other modules, and you can do it in the Python interactive shell too. This is an important concept, and you'll see a lot more of it throughout this book.
                                                    2. When you want to use functions defined in imported modules, you need to include the module name. So you can't just say approximate_size; it must be humansize.approximate_size. If you've used classes in Java, this should feel vaguely familiar.
                                                    3. Instead of calling the function as you would expect to, you asked for one of the function's attributes, __doc__. @@ -171,6 +176,7 @@ if __name__ == "__main__":

                                                      The import search path

                                                      Before this goes any further, I want to briefly mention the library search path. Python looks in several places when you try to import a module. Specifically, it looks in all the directories defined in sys.path. This is just a list, and you can easily view it or modify it with standard list methods. (You'll learn more about lists later in this chapter.) +

                                                      skip over this interpreter listing

                                                       >>> import sys                       
                                                       >>> sys.path                         
                                                      @@ -178,7 +184,7 @@ if __name__ == "__main__":
                                                       >>> sys                              
                                                       <module 'sys' (built-in)>
                                                       >>> sys.path.append('/my/new/path')  
                                                      -
                                                        +
                                                        1. Importing the sys module makes all of its functions and attributes available.
                                                        2. sys.path is a list of directory names that constitute the current search path. (Yours will look different, depending on your operating system, what version of Python you're running, and where it was originally installed.) Python will look through these directories (in this order) for a .py file whose name matches what you're trying to import.
                                                        3. Actually, I lied; the truth is more complicated than that, because not all modules are stored as .py files. Some, like the sys module, are "built-in modules"; they are actually baked right into Python itself. Built-in modules behave just like regular modules, but their Python source code is not available, because they are not written in Python! (The sys module is written in C.) @@ -190,6 +196,7 @@ if __name__ == "__main__":

                                                          This is so important that I'm going to repeat it in case you missed it the first few times: everything in Python is an object. Strings are objects. Lists are objects. Functions are objects. Even modules are objects.

                                                          Indenting code

                                                          Python functions have no explicit begin or end, and no curly braces to mark where the function code starts and stops. The only delimiter is a colon (:) and the indentation of the code itself. +

                                                          skip over this code listing

                                                          
                                                           def approximate_size(size, a_kilobyte_is_1024_bytes=True):  
                                                               if size < 0:                                            
                                                          @@ -202,7 +209,7 @@ if __name__ == "__main__":
                                                                       return "{0:.1f} {1}".format(size, suffix)
                                                           
                                                               raise ValueError('number too large')
                                                          -
                                                            +
                                                            1. Code blocks are defined by their indentation. By "code block," I mean functions, if statements, for loops, while loops, and so forth. Indenting starts a block and unindenting ends it. There are no explicit braces, brackets, or keywords. This means that whitespace is significant, and must be consistent. In this example, the function code is indented four spaces. It doesn't need to be four spaces, it just needs to be consistent. The first line that is not indented marks the end of the function.
                                                            2. In Python, an if statement is followed by a code block. If the if expression evaluates to true, the indented block is executed, otherwise it falls to the else block (if any). (Note the lack of parentheses around the expression.)
                                                            3. This line is inside the if code block. This raise statement will raise an exception (of type ValueError), but only if size < 0. @@ -215,19 +222,22 @@ if __name__ == "__main__":

                                                              Running scripts

                                                              Python modules are objects and have several useful attributes. You can use this to easily test your modules as you write them, by including a special block of code that executes when you run the Python file on the command line. Take the last few lines of humansize.py: +

                                                              skip over this code listing

                                                              
                                                               if __name__ == "__main__":
                                                                   print(approximate_size(1000000000000, False))
                                                                   print(approximate_size(1000000000000))
                                                              -
                                                              +

                                                              Like C, Python uses == for comparison and = for assignment. Unlike C, Python does not support in-line assignment, so there's no chance of accidentally assigning the value you thought you were comparing.

                                                              So what makes this if statement special? Well, modules are objects, and all modules have a built-in attribute __name__. A module's __name__ depends on how you're using the module. If you import the module, then __name__ is the module's filename, without a directory path or file extension. +

                                                              skip over this interpreter listing

                                                               >>> import humansize
                                                               >>> humansize.__name__
                                                               'humansize'
                                                              -

                                                              But you can also run the module directly as a standalone program, in which case __name__ will be a special default value, __main__. Python will evaluate this if statement, find a true expression, and execute the if code block. In this case, to print two values. +

                                                              But you can also run the module directly as a standalone program, in which case __name__ will be a special default value, __main__. Python will evaluate this if statement, find a true expression, and execute the if code block. In this case, to print two values. +

                                                              skip over this command output listing

                                                               c:\home\diveintopython3> c:\python30\python.exe humansize.py
                                                               1.0 TB