Overview
Comment: | Adding swaps recommendation for the bpe. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | bpe |
Files: | files | file ages | folders |
SHA1: |
a64b9a1ac35978dcd56c969252194062 |
User & Date: | spaskalev on 2014-12-28 12:44:19 |
Other Links: | branch diff | manifest | tags |
Context
2014-12-28
| ||
13:57 | Added a symbol-to-pair replacing reader in for the bpe check-in: 0083d7bfee user: spaskalev tags: bpe | |
12:44 | Adding swaps recommendation for the bpe. check-in: a64b9a1ac3 user: spaskalev tags: bpe | |
11:33 | Adding initial pieces of a prototype byte pair encoder check-in: 7f9a25c94d user: spaskalev tags: bpe | |
Changes
Modified src/0dev.org/commands/short/main.go from [4473925a61] to [528d5e2695].
︙ | ︙ | |||
12 13 14 15 16 17 18 | f, err := os.Open(os.Args[1]) if err != nil { os.Stderr.WriteString("Unable to open input file. " + err.Error()) os.Exit(1) } pairs, symbols := analyze(f) | > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | f, err := os.Open(os.Args[1]) if err != nil { os.Stderr.WriteString("Unable to open input file. " + err.Error()) os.Exit(1) } pairs, symbols := analyze(f) rec := recommend(pairs, symbols) fmt.Println(*rec) } type pair struct { value uint16 count uint64 } type symbol struct { value byte count uint64 } type recommendation struct { p2s map[uint16]byte s2p map[byte]uint16 } func recommend(pairs pairSlice, symbols symbolSlice) *recommendation { var ( rec recommendation pairsLength = len(pairs) ) rec.p2s = make(map[uint16]byte) // Store pair to symbol mappings rec.s2p = make(map[byte]uint16) // Store symbol to pair mappings for i := 0; i < pairsLength; i++ { currentPair := pairs[i] // Termination condition for when we are out of symbols if len(symbols) == 0 { break } gain := currentPair.count - 4 // 4 bytes for the default header currentSymbol := symbols[0] if currentSymbol.count == 0 { // Termination condition for possitive compression effect if gain <= 0 { break } // Mark the recommendation and proceed with the next pair rec.p2s[currentPair.value] = currentSymbol.value continue } else { // if the current symbol is present in the data // Decrease the gain by a symbol -> p`, p -> symbol replacement gain -= 2 // Additional 2 bytes for the more complex header gain -= currentSymbol.count // Account for swaping the symbol to a pair in order to free it // Termination condition for possitive compression effect if gain <= 0 { break } // Mark this symbol for replacement by the last unused pair rec.s2p[currentSymbol.value] = pairs[pairsLength-1].value pairsLength-- // Mark the current pair for replacement by the current symbol rec.p2s[currentPair.value] = currentSymbol.value } } return &rec } // Reads the provided input and returns information about the available byte pair and used symbols func analyze(reader io.Reader) (pairSlice, symbolSlice) { var ( current uint16 // Stores a pair of bytes in it's high and low bits buffer []byte = make([]byte, 1) |
︙ | ︙ | |||
46 47 48 49 50 51 52 | // Store bytes frequency symbols[value]++ } return len(data), nil }), reader) | | < | < < < < < < < < < < < | | | 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | // Store bytes frequency symbols[value]++ } return len(data), nil }), reader) // Extract and sort all byte pairs availablePairs := make(pairSlice, 0) for index, value := range pairs { availablePairs = append(availablePairs, pair{value: uint16(index), count: value}) } sort.Sort(availablePairs) // Extract and sort all symbols (including the ones with zero counts) allSymbols := make(symbolSlice, 0) for index, value := range symbols { allSymbols = append(allSymbols, symbol{value: byte(index), count: value}) } sort.Sort(allSymbols) return availablePairs, allSymbols } // Implements fmt.Stringer, used for debugging func (p pair) String() string { return fmt.Sprintf("[ %d %d (%d) ]", (p.value >> 8), ((p.value << 8) >> 8), p.count) } type pairSlice []pair func (s pairSlice) Len() int { return len(s) } func (s pairSlice) Less(i, j int) bool { // Sort in descending order return s[i].count > s[j].count } func (s pairSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } type symbolSlice []symbol func (s symbolSlice) Len() int { return len(s) } func (s symbolSlice) Less(i, j int) bool { // Sort in ascending order return s[i].count < s[j].count } func (s symbolSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } |