1 import codecs, encodings
2
"""The caller hands this library a buffer and asks it either to convert
the buffer or to auto-detect its encoding.
5
6 Based on http://code.activestate.com/recipes/52257/
7
8 Licensed under the PSF License
9 """
10
11
# Maps the first four bytes of an XML document to the encoding family they
# imply.  ``None`` in positions 3 and 4 marks a two-byte signature whose
# trailing bytes are document content and therefore not part of the key.
autodetect_dict = {
    (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",  # UCS-4 byte-order mark, big-endian
    (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",  # UCS-4 byte-order mark, little-endian
    (0xFE, 0xFF, None, None): "utf_16_be",  # UTF-16 byte-order mark, big-endian
    (0xFF, 0xFE, None, None): "utf_16_le",  # UTF-16 byte-order mark, little-endian
    (0x00, 0x3C, 0x00, 0x3F): "utf_16_be",  # "<?" in UTF-16 BE, no BOM
    (0x3C, 0x00, 0x3F, 0x00): "utf_16_le",  # "<?" in UTF-16 LE, no BOM
    (0x3C, 0x3F, 0x78, 0x6D): "utf_8",      # "<?xm" in an ASCII-compatible encoding
    (0x4C, 0x6F, 0xA7, 0x94): "EBCDIC",     # "<?xm" in EBCDIC; no Python codec by this name
}


def _declared_encoding(first_line):
    """Return the ``encoding`` value of an XML declaration line, or None.

    None is returned when the line is not an XML declaration, carries no
    ``encoding`` pseudo-attribute, or the attribute value is empty or not
    properly closed by a matching quote.
    """
    if not first_line.startswith(u"<?xml"):
        return None

    attr_pos = first_line.find(u"encoding")
    if attr_pos == -1:
        return None

    # Use whichever quote character appears first after "encoding"; otherwise
    # a later double-quoted attribute (e.g. standalone="yes") would be picked
    # up in place of a single-quoted encoding value.
    candidates = [
        pos
        for pos in (first_line.find(u'"', attr_pos),
                    first_line.find(u"'", attr_pos))
        if pos != -1
    ]
    if not candidates:
        return None

    start = min(candidates)
    end = first_line.find(first_line[start], start + 1)
    if end == -1:
        # Unterminated value: refuse to guess rather than return a fragment.
        return None

    return first_line[start + 1:end] or None


def autoDetectXMLEncoding(buffer):
    """Guess the encoding of an XML document from its raw bytes.

    The leading four bytes are matched against ``autodetect_dict``.  When
    they identify an encoding family that has an installed decoder, the
    buffer is decoded and the ``encoding`` pseudo-attribute of the XML
    declaration on the first line, if present, refines the guess.

    Args:
        buffer: The document as a byte string (``str`` on Python 2,
            ``bytes``/``bytearray`` on Python 3).  It should be at least
            four bytes long for any detection to happen.

    Returns:
        The declared encoding name when a declaration can be read,
        otherwise the name implied by the leading bytes, otherwise
        ``"utf_8"`` (the XML default).  The returned name might not have an
        installed decoder (e.g. ``"EBCDIC"``).
    """
    encoding = "utf_8"  # the XML default when nothing better is known

    # bytearray yields integers for both Python 2 str and Python 3 bytes.
    prefix = tuple(bytearray(buffer[:4]))
    if len(prefix) < 4:
        return encoding

    # Exact four-byte signatures first, then the two-byte BOM signatures.
    guess = (autodetect_dict.get(prefix)
             or autodetect_dict.get(prefix[:2] + (None, None)))
    if not guess:
        return encoding
    encoding = guess

    try:
        decode = codecs.getdecoder(encoding)
    except LookupError:
        # No decoder installed for this family; the guess is all we have.
        return encoding

    # Only the first line matters, so undecodable bytes elsewhere in the
    # document must not abort detection.  A BOM-prefixed document decodes
    # with a leading U+FEFF, so its first line does not start with "<?xml"
    # and the BOM-derived guess is returned unchanged.
    decoded, _ = decode(buffer, "replace")
    declared = _declared_encoding(decoded.split(u"\n", 1)[0])
    return declared if declared else encoding
71
75