Overview
| Comment: | Adding swaps recommendation for the bpe. |
|---|---|
| Downloads: | Tarball | ZIP archive |
| Timelines: | family | ancestors | descendants | both | bpe |
| Files: | files | file ages | folders |
| SHA1: |
a64b9a1ac35978dcd56c969252194062 |
| User & Date: | spaskalev on 2014-12-28 12:44:19.970 |
| Other Links: | branch diff | manifest | tags |
Context
|
2014-12-28
| ||
| 13:57 | Added a symbol-to-pair replacing reader in for the bpe check-in: 0083d7bfee user: spaskalev tags: bpe | |
| 12:44 | Adding swaps recommendation for the bpe. check-in: a64b9a1ac3 user: spaskalev tags: bpe | |
| 11:33 | Adding initial pieces of a prototype byte pair encoder check-in: 7f9a25c94d user: spaskalev tags: bpe | |
Changes
Modified src/0dev.org/commands/short/main.go
from [4473925a61]
to [528d5e2695].
| ︙ | ︙ | |||
12 13 14 15 16 17 18 |
f, err := os.Open(os.Args[1])
if err != nil {
os.Stderr.WriteString("Unable to open input file. " + err.Error())
os.Exit(1)
}
pairs, symbols := analyze(f)
| > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
f, err := os.Open(os.Args[1])
if err != nil {
os.Stderr.WriteString("Unable to open input file. " + err.Error())
os.Exit(1)
}
pairs, symbols := analyze(f)
rec := recommend(pairs, symbols)
fmt.Println(*rec)
}
type pair struct {
value uint16
count uint64
}
type symbol struct {
value byte
count uint64
}
type recommendation struct {
p2s map[uint16]byte
s2p map[byte]uint16
}
func recommend(pairs pairSlice, symbols symbolSlice) *recommendation {
var (
rec recommendation
pairsLength = len(pairs)
)
rec.p2s = make(map[uint16]byte) // Store pair to symbol mappings
rec.s2p = make(map[byte]uint16) // Store symbol to pair mappings
for i := 0; i < pairsLength; i++ {
currentPair := pairs[i]
// Termination condition for when we are out of symbols
if len(symbols) == 0 {
break
}
gain := currentPair.count - 4 // 4 bytes for the default header
currentSymbol := symbols[0]
if currentSymbol.count == 0 {
// Termination condition for possitive compression effect
if gain <= 0 {
break
}
// Mark the recommendation and proceed with the next pair
rec.p2s[currentPair.value] = currentSymbol.value
continue
} else { // if the current symbol is present in the data
// Decrease the gain by a symbol -> p`, p -> symbol replacement
gain -= 2 // Additional 2 bytes for the more complex header
gain -= currentSymbol.count // Account for swaping the symbol to a pair in order to free it
// Termination condition for possitive compression effect
if gain <= 0 {
break
}
// Mark this symbol for replacement by the last unused pair
rec.s2p[currentSymbol.value] = pairs[pairsLength-1].value
pairsLength--
// Mark the current pair for replacement by the current symbol
rec.p2s[currentPair.value] = currentSymbol.value
}
}
return &rec
}
// Reads the provided input and returns information about the available byte pair and used symbols
func analyze(reader io.Reader) (pairSlice, symbolSlice) {
var (
current uint16 // Stores a pair of bytes in it's high and low bits
buffer []byte = make([]byte, 1)
|
| ︙ | ︙ | |||
46 47 48 49 50 51 52 | // Store bytes frequency symbols[value]++ } return len(data), nil }), reader) | | < | < < < < < < < < < < < | | | 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
// Store bytes frequency
symbols[value]++
}
return len(data), nil
}), reader)
// Extract and sort all byte pairs
availablePairs := make(pairSlice, 0)
for index, value := range pairs {
availablePairs = append(availablePairs, pair{value: uint16(index), count: value})
}
sort.Sort(availablePairs)
// Extract and sort all symbols (including the ones with zero counts)
allSymbols := make(symbolSlice, 0)
for index, value := range symbols {
allSymbols = append(allSymbols, symbol{value: byte(index), count: value})
}
sort.Sort(allSymbols)
return availablePairs, allSymbols
}
// Implements fmt.Stringer, used for debugging
func (p pair) String() string {
return fmt.Sprintf("[ %d %d (%d) ]", (p.value >> 8), ((p.value << 8) >> 8), p.count)
}
type pairSlice []pair
func (s pairSlice) Len() int {
return len(s)
}
func (s pairSlice) Less(i, j int) bool {
// Sort in descending order
return s[i].count > s[j].count
}
func (s pairSlice) Swap(i, j int) {
s[i], s[j] = s[j], s[i]
}
type symbolSlice []symbol
func (s symbolSlice) Len() int {
return len(s)
}
func (s symbolSlice) Less(i, j int) bool {
// Sort in ascending order
return s[i].count < s[j].count
}
func (s symbolSlice) Swap(i, j int) {
s[i], s[j] = s[j], s[i]
}
|