Check-in [7f9a25c94d]
Overview
SHA1:7f9a25c94d8384dee4b6defc9cd5fa1f190a648a
Date: 2014-12-28 11:33:12
User: spaskalev
Comment:Adding initial pieces of a prototype byte pair encoder
Timelines: family | ancestors | descendants | both | bpe
Downloads: Tarball | ZIP archive
Other Links: files | file ages | folders | manifest
Tags And Properties
Context
2014-12-28
12:44
[a64b9a1ac3] Adding swaps recommendation for the bpe. (user: spaskalev, tags: bpe)
11:33
[7f9a25c94d] Adding initial pieces of a prototype byte pair encoder (user: spaskalev, tags: bpe)
2014-12-26
21:35
[129d90b4a8] Added 0dev.org/types, providing aliases that implement sort.Interface for [u]int{8|16|32|64} (user: spaskalev, tags: trunk)
Changes

Added src/0dev.org/commands/short/main.go version [4473925a61].

            1  +package main
            2  +
            3  +import (
            4  +	iou "0dev.org/ioutil"
            5  +	"fmt"
            6  +	"io"
            7  +	"os"
            8  +	"sort"
            9  +)
           10  +
           11  +func main() {
           12  +	f, err := os.Open(os.Args[1])
           13  +	if err != nil {
           14  +		os.Stderr.WriteString("Unable to open input file. " + err.Error())
           15  +		os.Exit(1)
           16  +	}
           17  +
           18  +	pairs, symbols := analyze(f)
           19  +	fmt.Println(pairs)
           20  +	fmt.Println(symbols)
           21  +}
           22  +
           23  +// Reads the provided input and returns information about the available byte pair and used symbols
           24  +func analyze(reader io.Reader) (pairSlice, symbolSlice) {
           25  +	var (
           26  +		current uint16   // Stores a pair of bytes in it's high and low bits
           27  +		buffer  []byte   = make([]byte, 1)
           28  +		pairs   []uint64 = make([]uint64, 65536) // all possible pairs, 512kb
           29  +		symbols []uint64 = make([]uint64, 256)   // all possible characters, 2kb
           30  +	)
           31  +
           32  +	// Read the first byte and store in the low bits of the current pair
           33  +	if c, err := reader.Read(buffer); err != nil || c != 1 {
           34  +		os.Stderr.WriteString("Error reading input.")
           35  +		os.Exit(1)
           36  +	}
           37  +	current = uint16(buffer[0])
           38  +
           39  +	// Read all of the data and note the counts of bytes and byte pairs
           40  +	io.Copy(iou.WriterFunc(func(data []byte) (int, error) {
           41  +		for _, value := range data {
           42  +			// Store pairs frequency
           43  +			current <<= 8            // Shift the previous byte from low to high
           44  +			current |= uint16(value) // Add the current byte to low
           45  +			pairs[current]++
           46  +
           47  +			// Store bytes frequency
           48  +			symbols[value]++
           49  +		}
           50  +		return len(data), nil
           51  +	}), reader)
           52  +
           53  +	// Extract and sort all available byte pairs
           54  +	availablePairs := make(pairSlice, 0)
           55  +	for index, value := range pairs {
           56  +		if value > 0 {
           57  +			availablePairs = append(availablePairs, pair{value: uint16(index), count: value})
           58  +		}
           59  +	}
           60  +	sort.Sort(availablePairs)
           61  +
           62  +	// Extract and sort all symbols (including the ones with zero counts)
           63  +	allSymbols := make(symbolSlice, 0)
           64  +	for index, value := range symbols {
           65  +		allSymbols = append(allSymbols, symbol{value: byte(index), count: value})
           66  +	}
           67  +	sort.Sort(allSymbols)
           68  +
           69  +	return availablePairs, allSymbols
           70  +}
           71  +
           72  +type pair struct {
           73  +	value uint16
           74  +	count uint64
           75  +}
           76  +
           77  +// Implements fmt.Stringer, used for debugging
           78  +func (p pair) String() string {
           79  +	return fmt.Sprintf("[ %d %d (%d) ]", (p.value >> 8), ((p.value << 8) >> 8), p.count)
           80  +}
           81  +
           82  +type pairSlice []pair
           83  +
           84  +func (s pairSlice) Len() int {
           85  +	return len(s)
           86  +}
           87  +
           88  +func (s pairSlice) Less(i, j int) bool {
           89  +	// Sort in descending order
           90  +	return s[i].count > s[j].count
           91  +}
           92  +
           93  +func (s pairSlice) Swap(i, j int) {
           94  +	s[i], s[j] = s[j], s[i]
           95  +}
           96  +
           97  +type symbol struct {
           98  +	value byte
           99  +	count uint64
          100  +}
          101  +
          102  +type symbolSlice []symbol
          103  +
          104  +func (s symbolSlice) Len() int {
          105  +	return len(s)
          106  +}
          107  +
          108  +func (s symbolSlice) Less(i, j int) bool {
          109  +	// Sort in descending order
          110  +	return s[i].count > s[j].count
          111  +}
          112  +
          113  +func (s symbolSlice) Swap(i, j int) {
          114  +	s[i], s[j] = s[j], s[i]
          115  +}