ADDED src/0dev.org/commands/short/main.go Index: src/0dev.org/commands/short/main.go ================================================================== --- src/0dev.org/commands/short/main.go +++ src/0dev.org/commands/short/main.go @@ -0,0 +1,189 @@ +package main + +import ( + iou "0dev.org/ioutil" + "fmt" + "io" + "os" + "sort" +) + +func main() { + f, err := os.Open(os.Args[1]) + if err != nil { + os.Stderr.WriteString("Unable to open input file. " + err.Error()) + os.Exit(1) + } + + pairs, symbols := analyze(f) + + rec := recommend(pairs, symbols) + fmt.Println(*rec) +} + +type recommendation struct { + p2s map[uint16]byte + s2p map[byte]uint16 +} + +func apply(rec *recommendation, reader io.Reader) { + symbolReader := iou.SizedReader(iou.ReaderFunc(func(output []byte) (int, error) { + for i := 0; i < len(output)-1; i++ { + // Read a byte from the underlying reader + count, err := reader.Read(output[i : i+1]) + if err != nil || count != 1 { + return i + count, err + } + + // Convert the byte to a pair if there is a mapping for it + if pair, ok := rec.s2p[output[i]]; ok { + output[i] = byte(pair >> 8) // extract the high byte from the pair + i++ + output[i] = byte(pair) // leave only the low byte from the pair + } + } + }), 2) +} + +func recommend(pairs pairSlice, symbols symbolSlice) *recommendation { + var ( + rec recommendation + pairsLength = len(pairs) + ) + + rec.p2s = make(map[uint16]byte) // Store pair to symbol mappings + rec.s2p = make(map[byte]uint16) // Store symbol to pair mappings + + for i := 0; i < pairsLength; i++ { + currentPair := pairs[i] + + // Termination condition for when we are out of symbols + if len(symbols) == 0 { + break + } + + gain := currentPair.count - 4 // 4 bytes for the default header + currentSymbol := symbols[0] + + if currentSymbol.count == 0 { + // Termination condition for possitive compression effect + if gain <= 0 { + break + } + + // Mark the recommendation and proceed with the next pair + rec.p2s[currentPair.value] = currentSymbol.value + continue + } else { // if the current symbol is present in the data + // Decrease the gain by a symbol -> p`, p -> symbol replacement + gain -= 2 // Additional 2 bytes for the more complex header + gain -= currentSymbol.count // Account for swaping the symbol to a pair in order to free it + + // Termination condition for possitive compression effect + if gain <= 0 { + break + } + + // Mark this symbol for replacement by the last unused pair + rec.s2p[currentSymbol.value] = pairs[pairsLength-1].value + pairsLength-- + + // Mark the current pair for replacement by the current symbol + rec.p2s[currentPair.value] = currentSymbol.value + } + } + + return &rec +} + +// Reads the provided input and returns information about the available byte pair and used symbols +func analyze(reader io.Reader) (pairSlice, symbolSlice) { + var ( + current uint16 // Stores a pair of bytes in it's high and low bits + buffer []byte = make([]byte, 1) + pairs []uint64 = make([]uint64, 65536) // all possible pairs, 512kb + symbols []uint64 = make([]uint64, 256) // all possible characters, 2kb + ) + + // Read the first byte and store in the low bits of the current pair + if c, err := reader.Read(buffer); err != nil || c != 1 { + os.Stderr.WriteString("Error reading input.") + os.Exit(1) + } + current = uint16(buffer[0]) + + // Read all of the data and note the counts of bytes and byte pairs + io.Copy(iou.WriterFunc(func(data []byte) (int, error) { + for _, value := range data { + // Store pairs frequency + current <<= 8 // Shift the previous byte from low to high + current |= uint16(value) // Add the current byte to low + pairs[current]++ + + // Store bytes frequency + symbols[value]++ + } + return len(data), nil + }), reader) + + // Extract and sort all byte pairs + availablePairs := make(pairSlice, 0) + for index, value := range pairs { + availablePairs = append(availablePairs, pair{value: uint16(index), count: value}) + } + sort.Sort(availablePairs) + + // Extract and sort all symbols (including the ones with zero counts) + allSymbols := make(symbolSlice, 0) + for index, value := range symbols { + allSymbols = append(allSymbols, symbol{value: byte(index), count: value}) + } + sort.Sort(allSymbols) + + return availablePairs, allSymbols +} + +type pair struct { + value uint16 + count uint64 +} + +type symbol struct { + value byte + count uint64 +} + +// Implements fmt.Stringer, used for debugging +func (p pair) String() string { + return fmt.Sprintf("[ %d %d (%d) ]", (p.value >> 8), ((p.value << 8) >> 8), p.count) +} + +type pairSlice []pair + +func (s pairSlice) Len() int { + return len(s) +} + +func (s pairSlice) Less(i, j int) bool { + // Sort in descending order + return s[i].count > s[j].count +} + +func (s pairSlice) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} + +type symbolSlice []symbol + +func (s symbolSlice) Len() int { + return len(s) +} + +func (s symbolSlice) Less(i, j int) bool { + // Sort in ascending order + return s[i].count < s[j].count +} + +func (s symbolSlice) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +}