Changes In Branch bpe Through [a64b9a1ac3] Excluding Merge-Ins
This is equivalent to a diff from 129d90b4a8 to a64b9a1ac3
2014-12-28
| ||
17:19 | Integrate ioutil.ReadByte from the bpe branch into trunk. check-in: 00c4e0e448 user: spaskalev tags: trunk | |
13:57 | Added a symbol-to-pair replacing reader in for the bpe check-in: 0083d7bfee user: spaskalev tags: bpe | |
12:44 | Adding swaps recommendation for the bpe. check-in: a64b9a1ac3 user: spaskalev tags: bpe | |
11:33 | Adding initial pieces of a prototype byte pair encoder check-in: 7f9a25c94d user: spaskalev tags: bpe | |
2014-12-26
| ||
21:35 | Added 0dev.org/types, providing aliases that implement sort.Interface for [u]int{8|16|32|64} check-in: 129d90b4a8 user: spaskalev tags: trunk | |
16:02 | Initial implementation of commands/mtf (based on commands/pdc) check-in: e72b637df4 user: spaskalev tags: trunk | |
Added src/0dev.org/commands/short/main.go version [528d5e2695].
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | package main import ( iou "0dev.org/ioutil" "fmt" "io" "os" "sort" ) func main() { f, err := os.Open(os.Args[1]) if err != nil { os.Stderr.WriteString("Unable to open input file. " + err.Error()) os.Exit(1) } pairs, symbols := analyze(f) rec := recommend(pairs, symbols) fmt.Println(*rec) } type pair struct { value uint16 count uint64 } type symbol struct { value byte count uint64 } type recommendation struct { p2s map[uint16]byte s2p map[byte]uint16 } func recommend(pairs pairSlice, symbols symbolSlice) *recommendation { var ( rec recommendation pairsLength = len(pairs) ) rec.p2s = make(map[uint16]byte) // Store pair to symbol mappings rec.s2p = make(map[byte]uint16) // Store symbol to pair mappings for i := 0; i < pairsLength; i++ { currentPair := pairs[i] // Termination condition for when we are out of symbols if len(symbols) == 0 { break } gain := currentPair.count - 4 // 4 bytes for the default header currentSymbol := symbols[0] if currentSymbol.count == 0 { // Termination condition for possitive compression effect if gain <= 0 { break } // Mark the recommendation and proceed with the next pair rec.p2s[currentPair.value] = currentSymbol.value continue } else { // if the current symbol is present in the data // Decrease the gain by a symbol -> p`, p -> symbol replacement gain -= 2 // Additional 2 bytes for the more complex header gain -= currentSymbol.count // Account for swaping the symbol to a pair in order to free it // Termination condition for possitive compression effect if gain <= 0 { break } // Mark this symbol for replacement by the last unused pair rec.s2p[currentSymbol.value] = pairs[pairsLength-1].value pairsLength-- // Mark the current pair for replacement by the current symbol rec.p2s[currentPair.value] = currentSymbol.value } } return &rec } // Reads the provided input and returns information about the available byte pair and used symbols func analyze(reader io.Reader) (pairSlice, symbolSlice) { var ( current uint16 // Stores a pair of bytes in it's high and low bits buffer []byte = make([]byte, 1) pairs []uint64 = make([]uint64, 65536) // all possible pairs, 512kb symbols []uint64 = make([]uint64, 256) // all possible characters, 2kb ) // Read the first byte and store in the low bits of the current pair if c, err := reader.Read(buffer); err != nil || c != 1 { os.Stderr.WriteString("Error reading input.") os.Exit(1) } current = uint16(buffer[0]) // Read all of the data and note the counts of bytes and byte pairs io.Copy(iou.WriterFunc(func(data []byte) (int, error) { for _, value := range data { // Store pairs frequency current <<= 8 // Shift the previous byte from low to high current |= uint16(value) // Add the current byte to low pairs[current]++ // Store bytes frequency symbols[value]++ } return len(data), nil }), reader) // Extract and sort all byte pairs availablePairs := make(pairSlice, 0) for index, value := range pairs { availablePairs = append(availablePairs, pair{value: uint16(index), count: value}) } sort.Sort(availablePairs) // Extract and sort all symbols (including the ones with zero counts) allSymbols := make(symbolSlice, 0) for index, value := range symbols { allSymbols = append(allSymbols, symbol{value: byte(index), count: value}) } sort.Sort(allSymbols) return availablePairs, allSymbols } // Implements fmt.Stringer, used for debugging func (p pair) String() string { return fmt.Sprintf("[ %d %d (%d) ]", (p.value >> 8), ((p.value << 8) >> 8), p.count) } type pairSlice []pair func (s pairSlice) Len() int { return len(s) } func (s pairSlice) Less(i, j int) bool { // Sort in descending order return s[i].count > s[j].count } func (s pairSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } type symbolSlice []symbol func (s symbolSlice) Len() int { return len(s) } func (s symbolSlice) Less(i, j int) bool { // Sort in ascending order return s[i].count < s[j].count } func (s symbolSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } |