Overview
Comment: | Adding initial pieces of a prototype byte pair encoder |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | bpe |
Files: | files | file ages | folders |
SHA1: |
7f9a25c94d8384dee4b6defc9cd5fa1f |
User & Date: | spaskalev on 2014-12-28 11:33:12 |
Other Links: | branch diff | manifest | tags |
Context
2014-12-28
| ||
12:44 | Adding swaps recommendation for the bpe. check-in: a64b9a1ac3 user: spaskalev tags: bpe | |
11:33 | Adding initial pieces of a prototype byte pair encoder check-in: 7f9a25c94d user: spaskalev tags: bpe | |
2014-12-26
| ||
21:35 | Added 0dev.org/types, providing aliases that implement sort.Interface for [u]int{8|16|32|64} check-in: 129d90b4a8 user: spaskalev tags: trunk | |
Changes
Added src/0dev.org/commands/short/main.go version [4473925a61].
package main

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"sort"
)

// main opens the file named by the first command-line argument, gathers byte
// and byte-pair statistics from it and prints both lists to standard output.
// It exits with status 1 when the argument is missing or the file cannot be
// opened.
func main() {
	// Guard the argument access; indexing os.Args[1] blindly panics when the
	// program is started without arguments.
	if len(os.Args) < 2 {
		os.Stderr.WriteString("Missing input file argument.\n")
		os.Exit(1)
	}

	f, err := os.Open(os.Args[1])
	if err != nil {
		os.Stderr.WriteString("Unable to open input file. " + err.Error() + "\n")
		os.Exit(1)
	}
	// Not executed on the os.Exit paths inside analyze; the process ends
	// there anyway, so the descriptor is released by the operating system.
	defer f.Close()

	pairs, symbols := analyze(f)
	fmt.Println(pairs)
	fmt.Println(symbols)
}

// analyze reads the provided input until EOF and returns information about
// the byte pairs and symbols (single bytes) it contains.
//
// The first result holds every pair of adjacent bytes that occurs at least
// once. The second result always holds all 256 byte values, including the
// ones with a zero count. Both are sorted by descending count; entries with
// equal counts keep ascending value order.
//
// Empty input or a read error is reported on standard error and terminates
// the process with status 1.
func analyze(reader io.Reader) (pairSlice, symbolSlice) {
	var (
		current uint16                   // Stores a pair of bytes in its high and low bits
		pairs   = make([]uint64, 65536) // all possible pairs, 512kb
		symbols = make([]uint64, 256)   // all possible characters, 2kb
	)

	input := bufio.NewReader(reader)

	// Read the first byte and store it in the low bits of the current pair.
	// It does not complete a pair yet, but it still counts as a symbol.
	first, err := input.ReadByte()
	if err != nil {
		os.Stderr.WriteString("Error reading input.\n")
		os.Exit(1)
	}
	current = uint16(first)
	symbols[first]++

	// Read all of the data and note the counts of bytes and byte pairs
	for {
		value, err := input.ReadByte()
		if err == io.EOF {
			break
		}
		if err != nil {
			// Statistics over a partially read input would be misleading.
			os.Stderr.WriteString("Error reading input.\n")
			os.Exit(1)
		}

		// Shift the previous byte from low to high and add the current byte
		// to low; the uint16 width discards the byte before the previous one.
		current = current<<8 | uint16(value)
		pairs[current]++

		symbols[value]++
	}

	// Extract and sort all available byte pairs
	availablePairs := make(pairSlice, 0)
	for index, count := range pairs {
		if count > 0 {
			availablePairs = append(availablePairs, pair{value: uint16(index), count: count})
		}
	}
	// Stable keeps the ascending value order of the loop above for ties.
	sort.Stable(availablePairs)

	// Extract and sort all symbols (including the ones with zero counts)
	allSymbols := make(symbolSlice, 0, len(symbols))
	for index, count := range symbols {
		allSymbols = append(allSymbols, symbol{value: byte(index), count: count})
	}
	sort.Stable(allSymbols)

	return availablePairs, allSymbols
}

// pair is a pair of adjacent bytes together with its number of occurrences.
type pair struct {
	value uint16 // first byte in the high 8 bits, second byte in the low 8 bits
	count uint64
}

// String implements fmt.Stringer, used for debugging. The format is
// "[ <first byte> <second byte> (<count>) ]" with all numbers in decimal.
func (p pair) String() string {
	return fmt.Sprintf("[ %d %d (%d) ]", p.value>>8, p.value&0xff, p.count)
}

// pairSlice implements sort.Interface, ordering pairs by descending count.
type pairSlice []pair

func (s pairSlice) Len() int {
	return len(s)
}

func (s pairSlice) Less(i, j int) bool {
	// Sort in descending order
	return s[i].count > s[j].count
}

func (s pairSlice) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}

// symbol is a single byte value together with its number of occurrences.
type symbol struct {
	value byte
	count uint64
}

// symbolSlice implements sort.Interface, ordering symbols by descending count.
type symbolSlice []symbol

func (s symbolSlice) Len() int {
	return len(s)
}

func (s symbolSlice) Less(i, j int) bool {
	// Sort in descending order
	return s[i].count > s[j].count
}

func (s symbolSlice) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}