Check-in [7f9a25c94d]
Overview
SHA1:7f9a25c94d8384dee4b6defc9cd5fa1f190a648a
Date: 2014-12-28 11:33:12
User: spaskalev
Comment:Adding initial pieces of a prototype byte pair encoder
Timelines: family | ancestors | descendants | both | bpe
Downloads: Tarball | ZIP archive
Other Links: files | file ages | folders | manifest
Tags And Properties
Context
2014-12-28
12:44
[a64b9a1ac3] Adding swaps recommendation for the bpe. (user: spaskalev, tags: bpe)
11:33
[7f9a25c94d] Adding initial pieces of a prototype byte pair encoder (user: spaskalev, tags: bpe)
2014-12-26
21:35
[129d90b4a8] Added 0dev.org/types, providing aliases that implement sort.Interface for [u]int{8|16|32|64} (user: spaskalev, tags: trunk)
Changes

Added src/0dev.org/commands/short/main.go version [4473925a61].







































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
package main

import (
	iou "0dev.org/ioutil"
	"fmt"
	"io"
	"os"
	"sort"
)

func main() {
	f, err := os.Open(os.Args[1])
	if err != nil {
		os.Stderr.WriteString("Unable to open input file. " + err.Error())
		os.Exit(1)
	}

	pairs, symbols := analyze(f)
	fmt.Println(pairs)
	fmt.Println(symbols)
}

// Reads the provided input and returns information about the available byte pair and used symbols
func analyze(reader io.Reader) (pairSlice, symbolSlice) {
	var (
		current uint16   // Stores a pair of bytes in it's high and low bits
		buffer  []byte   = make([]byte, 1)
		pairs   []uint64 = make([]uint64, 65536) // all possible pairs, 512kb
		symbols []uint64 = make([]uint64, 256)   // all possible characters, 2kb
	)

	// Read the first byte and store in the low bits of the current pair
	if c, err := reader.Read(buffer); err != nil || c != 1 {
		os.Stderr.WriteString("Error reading input.")
		os.Exit(1)
	}
	current = uint16(buffer[0])

	// Read all of the data and note the counts of bytes and byte pairs
	io.Copy(iou.WriterFunc(func(data []byte) (int, error) {
		for _, value := range data {
			// Store pairs frequency
			current <<= 8            // Shift the previous byte from low to high
			current |= uint16(value) // Add the current byte to low
			pairs[current]++

			// Store bytes frequency
			symbols[value]++
		}
		return len(data), nil
	}), reader)

	// Extract and sort all available byte pairs
	availablePairs := make(pairSlice, 0)
	for index, value := range pairs {
		if value > 0 {
			availablePairs = append(availablePairs, pair{value: uint16(index), count: value})
		}
	}
	sort.Sort(availablePairs)

	// Extract and sort all symbols (including the ones with zero counts)
	allSymbols := make(symbolSlice, 0)
	for index, value := range symbols {
		allSymbols = append(allSymbols, symbol{value: byte(index), count: value})
	}
	sort.Sort(allSymbols)

	return availablePairs, allSymbols
}

type pair struct {
	value uint16
	count uint64
}

// Implements fmt.Stringer, used for debugging
func (p pair) String() string {
	return fmt.Sprintf("[ %d %d (%d) ]", (p.value >> 8), ((p.value << 8) >> 8), p.count)
}

type pairSlice []pair

func (s pairSlice) Len() int {
	return len(s)
}

func (s pairSlice) Less(i, j int) bool {
	// Sort in descending order
	return s[i].count > s[j].count
}

func (s pairSlice) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}

type symbol struct {
	value byte
	count uint64
}

type symbolSlice []symbol

func (s symbolSlice) Len() int {
	return len(s)
}

func (s symbolSlice) Less(i, j int) bool {
	// Sort in descending order
	return s[i].count > s[j].count
}

func (s symbolSlice) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}