mirror of
https://github.com/kennethreitz/tablib.git
synced 2026-06-05 15:00:19 +00:00
82 lines
3.3 KiB
Python
82 lines
3.3 KiB
Python
# -*- coding: windows-1252 -*-
|
|
|
|
'''
|
|
From BIFF8 on, strings are always stored using UTF-16LE text encoding. The
|
|
character array is a sequence of 16-bit values4. Additionally it is
|
|
possible to use a compressed format, which omits the high bytes of all
|
|
characters, if they are all zero.
|
|
|
|
The following tables describe the standard format of the entire string, but
|
|
in many records the strings differ from this format. This will be mentioned
|
|
separately. It is possible (but not required) to store Rich-Text formatting
|
|
information and Asian phonetic information inside a Unicode string. This
|
|
results in four different ways to store a string. The character array
|
|
is not zero-terminated.
|
|
|
|
The string consists of the character count (as usual an 8-bit value or
|
|
a 16-bit value), option flags, the character array and optional formatting
|
|
information. If the string is empty, sometimes the option flags field will
|
|
not occur. This is mentioned at the respective place.
|
|
|
|
Offset Size Contents
|
|
0 1 or 2 Length of the string (character count, ln)
|
|
1 or 2 1 Option flags:
|
|
Bit Mask Contents
|
|
0 01H Character compression (ccompr):
|
|
0 = Compressed (8-bit characters)
|
|
1 = Uncompressed (16-bit characters)
|
|
2 04H Asian phonetic settings (phonetic):
|
|
0 = Does not contain Asian phonetic settings
|
|
1 = Contains Asian phonetic settings
|
|
3 08H Rich-Text settings (richtext):
|
|
0 = Does not contain Rich-Text settings
|
|
1 = Contains Rich-Text settings
|
|
[2 or 3] 2 (optional, only if richtext=1) Number of Rich-Text formatting runs (rt)
|
|
[var.] 4 (optional, only if phonetic=1) Size of Asian phonetic settings block (in bytes, sz)
|
|
var. ln or
|
|
2·ln Character array (8-bit characters or 16-bit characters, dependent on ccompr)
|
|
[var.] 4·rt (optional, only if richtext=1) List of rt formatting runs
|
|
[var.] sz (optional, only if phonetic=1) Asian Phonetic Settings Block
|
|
'''
|
|
|
|
|
|
from struct import pack
|
|
|
|
def upack2(s, encoding='ascii'):
|
|
# If not unicode, make it so.
|
|
if isinstance(s, unicode):
|
|
us = s
|
|
else:
|
|
us = unicode(s, encoding)
|
|
# Limit is based on number of content characters
|
|
# (not on number of bytes in packed result)
|
|
len_us = len(us)
|
|
if len_us > 65535:
|
|
raise Exception('String longer than 65535 characters')
|
|
try:
|
|
encs = us.encode('latin1')
|
|
# Success here means all chars are in U+0000 to U+00FF
|
|
# inclusive, meaning that we can use "compressed format".
|
|
flag = 0
|
|
except UnicodeEncodeError:
|
|
encs = us.encode('utf_16_le')
|
|
flag = 1
|
|
return pack('<HB', len_us, flag) + encs
|
|
|
|
def upack1(s, encoding='ascii'):
|
|
# Same as upack2(), but with a one-byte length field.
|
|
if isinstance(s, unicode):
|
|
us = s
|
|
else:
|
|
us = unicode(s, encoding)
|
|
len_us = len(us)
|
|
if len_us > 255:
|
|
raise Exception('String longer than 255 characters')
|
|
try:
|
|
encs = us.encode('latin1')
|
|
flag = 0
|
|
except UnicodeEncodeError:
|
|
encs = us.encode('utf_16_le')
|
|
flag = 1
|
|
return pack('<BB', len_us, flag) + encs
|