Changes In Branch bpe Excluding Merge-Ins
This is equivalent to a diff from 129d90b4a8 to 4fe28f11ac
2014-12-28
| ||
17:19 | Integrate ioutil.ReadByte from the bpe branch into trunk. check-in: 00c4e0e448 user: spaskalev tags: trunk | |
17:16 | Closing the branch, this particular implementation is unfruitfull :) Closed-Leaf check-in: 4fe28f11ac user: spaskalev tags: bpe | |
15:54 | a testable bpe implementation that encodes and decodes in a single execution check-in: a06f897887 user: spaskalev tags: bpe | |
11:33 | Adding initial pieces of a prototype byte pair encoder check-in: 7f9a25c94d user: spaskalev tags: bpe | |
2014-12-26
| ||
21:35 | Added 0dev.org/types, providing aliases that implement sort.Interface for [u]int{8|16|32|64} check-in: 129d90b4a8 user: spaskalev tags: trunk | |
16:02 | Initial implementation of commands/mtf (based on commands/pdc) check-in: e72b637df4 user: spaskalev tags: trunk | |
Added src/0dev.org/commands/short/main.go version [0cc7fcc383].
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 | package main import ( iou "0dev.org/ioutil" "fmt" "io" // "io/ioutil" "bytes" "os" "sort" ) func main() { f, err := os.Open(os.Args[1]) if err != nil { os.Stderr.WriteString("Unable to open input file. " + err.Error()) os.Exit(1) } pairs, symbols := analyze(f) _, err = f.Seek(0, 0) if err != nil { os.Stderr.WriteString("Unable to reset input file position. " + err.Error()) os.Exit(1) } rec := recommend(pairs, symbols) reader, err := apply(rec, f) if err != nil { os.Stderr.WriteString("Error while constructing application reader. " + err.Error()) os.Exit(1) } var buf bytes.Buffer _, err = io.Copy(&buf, reader) if err != nil { os.Stderr.WriteString("Error while applying recommendations. " + err.Error()) os.Exit(1) } var in *bytes.Reader = bytes.NewReader(buf.Bytes()) rev, err := apply(rec.reverse(), in) if err != nil { os.Stderr.WriteString("Error while constructing application reader. " + err.Error()) os.Exit(1) } _, err = io.Copy(os.Stdout, rev) if err != nil { os.Stderr.WriteString("Error while applying recommendations. " + err.Error()) os.Exit(1) } } type recommendation struct { p2s map[uint16]byte s2p map[byte]uint16 } // Produces a reversed recommendation struct func (r *recommendation) reverse() *recommendation { var rec recommendation rec.p2s = make(map[uint16]byte) for k, v := range r.s2p { rec.p2s[v] = k } rec.s2p = make(map[byte]uint16) for k, v := range r.p2s { rec.s2p[v] = k } return &rec } // Returns an io.Reader that reads from the underlying one while applying the given recommendations func apply(rec *recommendation, reader io.Reader) (io.Reader, error) { // The symbol reader replaces symbols with pairs according to the s2p mapping symbolReader := iou.SizedReader(iou.ReaderFunc(func(output []byte) (int, error) { var ( i int = 0 err error ) for ; i < len(output)-1 && err == nil; i++ { // Read a byte from the underlying reader count, err := reader.Read(output[i : i+1]) // If we can't read anything else - return immediatelly if count == 0 { return i, err } // Convert the byte to a pair if there is a mapping for it if p, ok := rec.s2p[output[i]]; ok { output[i] = byte(p >> 8) // extract the high byte from the pair i++ output[i] = byte(p) // leave only the low byte from the pair } } return i, nil }), 2) currentByte, err := iou.ReadByte(symbolReader) if err != nil { return nil, err } // The pair reader replaces pairs with symbols according to the p2s mapping pairReader := iou.ReaderFunc(func(output []byte) (int, error) { var ( total int err error ) start: if len(output) <= 1 || err != nil { return total, err } nextByte, err := iou.ReadByte(symbolReader) if s, ok := rec.p2s[(uint16(currentByte)<<8 | uint16(nextByte))]; ok { output[0] = s currentByte, err = iou.ReadByte(symbolReader) } else { output[0] = currentByte currentByte = nextByte } output, total = output[1:], total+1 goto start }) return pairReader, nil } func recommend(pairs pairSlice, symbols symbolSlice) *recommendation { var rec recommendation rec.p2s = make(map[uint16]byte) // Store pair to symbol mappings rec.s2p = make(map[byte]uint16) // Store symbol to pair mappings for i, pairsLength := 0, len(pairs); i < pairsLength; i++ { currentPair := pairs[i] // Termination condition for when we are out of symbols if len(symbols) == 1 { // TODO drop to zero ? break } gain := currentPair.count - 4 // 4 bytes for the default header currentSymbol := symbols[0] if currentSymbol.count == 0 { // Termination condition for possitive compression effect if gain <= 0 { break } // Mark the recommendation rec.p2s[currentPair.value] = currentSymbol.value } else { // if the current symbol is present in the data // Decrease the gain by a symbol -> p`, p -> symbol replacement gain = gain - 2 - currentSymbol.count // 2 more bytes for header + replacement cost // Termination condition for possitive compression effect if gain <= 0 { break } // Mark this symbol for replacement by the last unused pair rec.s2p[currentSymbol.value] = pairs[pairsLength-1].value pairsLength-- // Mark the current pair for replacement by the current symbol rec.p2s[currentPair.value] = currentSymbol.value } symbols = symbols[1:] } return &rec } // Reads the provided input and returns information about the available byte pair and used symbols func analyze(reader io.Reader) (pairSlice, symbolSlice) { var ( current uint16 // Stores a pair of bytes in it's high and low bits buffer []byte = make([]byte, 1) pairs []uint64 = make([]uint64, 65536) // all possible pairs, 512kb symbols []uint64 = make([]uint64, 256) // all possible characters, 2kb ) // Read the first byte and store in the low bits of the current pair if c, err := reader.Read(buffer); err != nil || c != 1 { os.Stderr.WriteString("Error reading input.") os.Exit(1) } current = uint16(buffer[0]) // Read all of the data and note the counts of bytes and byte pairs io.Copy(iou.WriterFunc(func(data []byte) (int, error) { for _, value := range data { // Store pairs frequency current <<= 8 // Shift the previous byte from low to high current |= uint16(value) // Add the current byte to low pairs[current]++ // Store bytes frequency symbols[value]++ } return len(data), nil }), reader) // Extract and sort all byte pairs availablePairs := make(pairSlice, 0) for index, value := range pairs { availablePairs = append(availablePairs, pair{value: uint16(index), count: value}) } sort.Sort(availablePairs) // Extract and sort all symbols (including the ones with zero counts) allSymbols := make(symbolSlice, 0) for index, value := range symbols { allSymbols = append(allSymbols, symbol{value: byte(index), count: value}) } sort.Sort(allSymbols) return availablePairs, allSymbols } type pair struct { value uint16 count uint64 } type symbol struct { value byte count uint64 } // Implements fmt.Stringer, used for debugging func (p pair) String() string { return fmt.Sprintf("[ %d %d (%d) ]", (p.value >> 8), ((p.value << 8) >> 8), p.count) } type pairSlice []pair func (s pairSlice) Len() int { return len(s) } func (s pairSlice) Less(i, j int) bool { // Sort in descending order return s[i].count > s[j].count } func (s pairSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } type symbolSlice []symbol func (s symbolSlice) Len() int { return len(s) } func (s symbolSlice) Less(i, j int) bool { // Sort in ascending order return s[i].count < s[j].count } func (s symbolSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } |
Modified src/0dev.org/ioutil/ioutil.go from [c11fdb5a53] to [83fc64874b].
︙ | ︙ | |||
16 17 18 19 20 21 22 23 24 25 26 27 28 29 | // An function alias type that implements io.Reader. type ReaderFunc func([]byte) (int, error) // Delegates the call to the WriterFunc while implementing io.Reader. func (r ReaderFunc) Read(b []byte) (int, error) { return r(b) } // Returns a writer that delegates calls to Write(...) while ensuring // that it is never called with less bytes than the specified amount. // // Calls with fewer bytes are buffered while a call with a nil slice // causes the buffer to be flushed to the underlying writer. func SizedWriter(writer io.Writer, size int) io.Writer { | > > > > > > > > > > | 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | // An function alias type that implements io.Reader. type ReaderFunc func([]byte) (int, error) // Delegates the call to the WriterFunc while implementing io.Reader. func (r ReaderFunc) Read(b []byte) (int, error) { return r(b) } // Reads a single byte from the provided io.Reader func ReadByte(reader io.Reader) (byte, error) { var ( arr [1]byte err error ) _, err = reader.Read(arr[:]) return arr[0], err } // Returns a writer that delegates calls to Write(...) while ensuring // that it is never called with less bytes than the specified amount. // // Calls with fewer bytes are buffered while a call with a nil slice // causes the buffer to be flushed to the underlying writer. func SizedWriter(writer io.Writer, size int) io.Writer { |
︙ | ︙ |
Modified src/0dev.org/ioutil/ioutil_test.go from [91aaac4713] to [514a112f00].
︙ | ︙ | |||
43 44 45 46 47 48 49 50 51 52 53 54 55 56 | // Diff the result against the initial input delta := diff.Diff(diff.D{len(input), len(output), func(i, j int) bool { return input[i] == output[j] }}) if len(delta.Added) > 0 || len(delta.Removed) > 0 { t.Error("Differences detected ", delta) } } func TestSizedWriter(t *testing.T) { var ( buffer bytes.Buffer writer io.Writer = SizedWriter(&buffer, 4) ) | > > > > > > > > > > > > > > > > > > > > | 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | // Diff the result against the initial input delta := diff.Diff(diff.D{len(input), len(output), func(i, j int) bool { return input[i] == output[j] }}) if len(delta.Added) > 0 || len(delta.Removed) > 0 { t.Error("Differences detected ", delta) } } func TestReadByte(t *testing.T) { var ( input []byte = []byte{255} reader *bytes.Reader = bytes.NewReader(input) ) result, err := ReadByte(reader) if result != input[0] { t.Error("Unexpected read result from ReadByte", result) } if err != nil { t.Error("Unexpected error from ReadByte", err) } result, err = ReadByte(reader) if err != io.EOF { t.Error("Unexpected nil error from ReadByte, read value:", result) } } func TestSizedWriter(t *testing.T) { var ( buffer bytes.Buffer writer io.Writer = SizedWriter(&buffer, 4) ) |
︙ | ︙ |