Changes In Branch bpe Through [879630c89c] Excluding Merge-Ins
This is equivalent to a diff from 129d90b4a8 to 879630c89c
2014-12-28
| ||
17:19 | Integrate ioutil.ReadByte from the bpe branch into trunk. check-in: 00c4e0e448 user: spaskalev tags: trunk | |
15:21 | bpe encoding flow implementation check-in: 7733ef9df8 user: spaskalev tags: bpe | |
14:14 | Added 0dev.org/ioutil.ReadByte() function and a test for it. CC at 100% check-in: 879630c89c user: spaskalev tags: bpe | |
13:57 | Added a symbol-to-pair replacing reader in for the bpe check-in: 0083d7bfee user: spaskalev tags: bpe | |
11:33 | Adding initial pieces of a prototype byte pair encoder check-in: 7f9a25c94d user: spaskalev tags: bpe | |
2014-12-26
| ||
21:35 | Added 0dev.org/types, providing aliases that implement sort.Interface for [u]int{8|16|32|64} check-in: 129d90b4a8 user: spaskalev tags: trunk | |
16:02 | Initial implementation of commands/mtf (based on commands/pdc) check-in: e72b637df4 user: spaskalev tags: trunk | |
Added src/0dev.org/commands/short/main.go version [2e2860e831].
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 | package main import ( iou "0dev.org/ioutil" "fmt" "io" "os" "sort" ) func main() { f, err := os.Open(os.Args[1]) if err != nil { os.Stderr.WriteString("Unable to open input file. " + err.Error()) os.Exit(1) } pairs, symbols := analyze(f) rec := recommend(pairs, symbols) fmt.Println(*rec) } type recommendation struct { p2s map[uint16]byte s2p map[byte]uint16 } func apply(rec *recommendation, reader io.Reader) { symbolReader := iou.SizedReader(iou.ReaderFunc(func(output []byte) (int, error) { var i int = 0 for ; i < len(output)-1; i++ { // Read a byte from the underlying reader count, err := reader.Read(output[i : i+1]) // If we can't read anything else - return immediatelly if count == 0 { return i, err } // Convert the byte to a pair if there is a mapping for it if pair, ok := rec.s2p[output[i]]; ok { output[i] = byte(pair >> 8) // extract the high byte from the pair i++ output[i] = byte(pair) // leave only the low byte from the pair } // Return on error if err != nil { return i + 1, err } } return i + 1, nil }), 2) pairReader := iou.ReaderFunc(func(output []byte) (int, error) { for i := 0; i < len(output); i++ { } }) } func recommend(pairs pairSlice, symbols symbolSlice) *recommendation { var ( rec recommendation pairsLength = len(pairs) ) rec.p2s = make(map[uint16]byte) // Store pair to symbol mappings rec.s2p = make(map[byte]uint16) // Store symbol to pair mappings for i := 0; i < pairsLength; i++ { currentPair := pairs[i] // Termination condition for when we are out of symbols if len(symbols) == 0 { break } gain := currentPair.count - 4 // 4 bytes for the default header currentSymbol := symbols[0] if currentSymbol.count == 0 { // Termination condition for possitive compression effect if gain <= 0 { break } // Mark the recommendation and proceed with the next pair rec.p2s[currentPair.value] = currentSymbol.value continue } else { // if the current symbol is present in the data // Decrease the gain by a symbol -> p`, p -> symbol replacement gain -= 2 // Additional 2 bytes for the more complex header gain -= currentSymbol.count // Account for swaping the symbol to a pair in order to free it // Termination condition for possitive compression effect if gain <= 0 { break } // Mark this symbol for replacement by the last unused pair rec.s2p[currentSymbol.value] = pairs[pairsLength-1].value pairsLength-- // Mark the current pair for replacement by the current symbol rec.p2s[currentPair.value] = currentSymbol.value } } return &rec } // Reads the provided input and returns information about the available byte pair and used symbols func analyze(reader io.Reader) (pairSlice, symbolSlice) { var ( current uint16 // Stores a pair of bytes in it's high and low bits buffer []byte = make([]byte, 1) pairs []uint64 = make([]uint64, 65536) // all possible pairs, 512kb symbols []uint64 = make([]uint64, 256) // all possible characters, 2kb ) // Read the first byte and store in the low bits of the current pair if c, err := reader.Read(buffer); err != nil || c != 1 { os.Stderr.WriteString("Error reading input.") os.Exit(1) } current = uint16(buffer[0]) // Read all of the data and note the counts of bytes and byte pairs io.Copy(iou.WriterFunc(func(data []byte) (int, error) { for _, value := range data { // Store pairs frequency current <<= 8 // Shift the previous byte from low to high current |= uint16(value) // Add the current byte to low pairs[current]++ // Store bytes frequency symbols[value]++ } return len(data), nil }), reader) // Extract and sort all byte pairs availablePairs := make(pairSlice, 0) for index, value := range pairs { availablePairs = append(availablePairs, pair{value: uint16(index), count: value}) } sort.Sort(availablePairs) // Extract and sort all symbols (including the ones with zero counts) allSymbols := make(symbolSlice, 0) for index, value := range symbols { allSymbols = append(allSymbols, symbol{value: byte(index), count: value}) } sort.Sort(allSymbols) return availablePairs, allSymbols } type pair struct { value uint16 count uint64 } type symbol struct { value byte count uint64 } // Implements fmt.Stringer, used for debugging func (p pair) String() string { return fmt.Sprintf("[ %d %d (%d) ]", (p.value >> 8), ((p.value << 8) >> 8), p.count) } type pairSlice []pair func (s pairSlice) Len() int { return len(s) } func (s pairSlice) Less(i, j int) bool { // Sort in descending order return s[i].count > s[j].count } func (s pairSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } type symbolSlice []symbol func (s symbolSlice) Len() int { return len(s) } func (s symbolSlice) Less(i, j int) bool { // Sort in ascending order return s[i].count < s[j].count } func (s symbolSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } |
Modified src/0dev.org/ioutil/ioutil.go from [c11fdb5a53] to [83fc64874b].
︙ | ︙ | |||
16 17 18 19 20 21 22 23 24 25 26 27 28 29 | // An function alias type that implements io.Reader. type ReaderFunc func([]byte) (int, error) // Delegates the call to the WriterFunc while implementing io.Reader. func (r ReaderFunc) Read(b []byte) (int, error) { return r(b) } // Returns a writer that delegates calls to Write(...) while ensuring // that it is never called with less bytes than the specified amount. // // Calls with fewer bytes are buffered while a call with a nil slice // causes the buffer to be flushed to the underlying writer. func SizedWriter(writer io.Writer, size int) io.Writer { | > > > > > > > > > > | 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | // An function alias type that implements io.Reader. type ReaderFunc func([]byte) (int, error) // Delegates the call to the WriterFunc while implementing io.Reader. func (r ReaderFunc) Read(b []byte) (int, error) { return r(b) } // Reads a single byte from the provided io.Reader func ReadByte(reader io.Reader) (byte, error) { var ( arr [1]byte err error ) _, err = reader.Read(arr[:]) return arr[0], err } // Returns a writer that delegates calls to Write(...) while ensuring // that it is never called with less bytes than the specified amount. // // Calls with fewer bytes are buffered while a call with a nil slice // causes the buffer to be flushed to the underlying writer. func SizedWriter(writer io.Writer, size int) io.Writer { |
︙ | ︙ |
Modified src/0dev.org/ioutil/ioutil_test.go from [91aaac4713] to [514a112f00].
︙ | ︙ | |||
43 44 45 46 47 48 49 50 51 52 53 54 55 56 | // Diff the result against the initial input delta := diff.Diff(diff.D{len(input), len(output), func(i, j int) bool { return input[i] == output[j] }}) if len(delta.Added) > 0 || len(delta.Removed) > 0 { t.Error("Differences detected ", delta) } } func TestSizedWriter(t *testing.T) { var ( buffer bytes.Buffer writer io.Writer = SizedWriter(&buffer, 4) ) | > > > > > > > > > > > > > > > > > > > > | 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | // Diff the result against the initial input delta := diff.Diff(diff.D{len(input), len(output), func(i, j int) bool { return input[i] == output[j] }}) if len(delta.Added) > 0 || len(delta.Removed) > 0 { t.Error("Differences detected ", delta) } } func TestReadByte(t *testing.T) { var ( input []byte = []byte{255} reader *bytes.Reader = bytes.NewReader(input) ) result, err := ReadByte(reader) if result != input[0] { t.Error("Unexpected read result from ReadByte", result) } if err != nil { t.Error("Unexpected error from ReadByte", err) } result, err = ReadByte(reader) if err != io.EOF { t.Error("Unexpected nil error from ReadByte, read value:", result) } } func TestSizedWriter(t *testing.T) { var ( buffer bytes.Buffer writer io.Writer = SizedWriter(&buffer, 4) ) |
︙ | ︙ |