Check-in [f996b5e7f0]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Added Nakamichi
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: f996b5e7f000295df2ac3622d83965a418892b35
User & Date: m 2014-04-17 18:17:06
Context
2014-04-27
15:50
Updated Nakamichi to round 5; Updated TODOs check-in: 5b6b454d6a user: m tags: trunk
2014-04-17
18:17
Added Nakamichi check-in: f996b5e7f0 user: m tags: trunk
2014-04-10
17:10
Added crush check-in: 2d4af53f63 user: m tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/CHANGELOG.

1
2
3
4

5
6
7
8
9
10
11
0.15
[+] added memcpy, memmove codecs
[+] added crush
[+] added zling

[~] updated lz4 to r114
[~] updated LZJB to the latest FreeBSD version
[~] updated tornado to 0.6a
[-] removed xxhash256, it's dangerously weak
[!] fixed integer underflow
[!] fixed a segfault with nop codec





>







1
2
3
4
5
6
7
8
9
10
11
12
0.15
[+] added memcpy, memmove codecs
[+] added crush
[+] added zling
[+] added Nakamichi
[~] updated lz4 to r114
[~] updated LZJB to the latest FreeBSD version
[~] updated tornado to 0.6a
[-] removed xxhash256, it's dangerously weak
[!] fixed integer underflow
[!] fixed a segfault with nop codec

Changes to src/CMakeLists.txt.

33
34
35
36
37
38
39

40
41
42
43
44
45
46
set(USE_LZV1              0) # inefficient
set(USE_LZWC              0) # inefficient
set(USE_LZX_COMPRESS      0) # inefficient, no decompressor
set(USE_MINIHUFF          1)
set(USE_MINIZ             1)
set(USE_MMINI             0) # doesn't compile with MSVC10
set(USE_MURMUR            1)

set(USE_NRV               0) # inefficient
set(USE_QUICKLZ           1)
set(USE_QUICKLZZIP        0) # quite good, but single-threaded. I may fix it one day.
set(USE_RLE64             1)
set(USE_SANMAYCE_FNV      1)
set(USE_SCZ               0) # single threaded and far too inefficient to be worth fixing...
set(USE_SHA3_RND1         0) # didn't make it to the final...







>







33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
set(USE_LZV1              0) # inefficient
set(USE_LZWC              0) # inefficient
set(USE_LZX_COMPRESS      0) # inefficient, no decompressor
set(USE_MINIHUFF          1)
set(USE_MINIZ             1)
set(USE_MMINI             0) # doesn't compile with MSVC10
set(USE_MURMUR            1)
set(USE_NAKAMICHI         0) # inefficient
set(USE_NRV               0) # inefficient
set(USE_QUICKLZ           1)
set(USE_QUICKLZZIP        0) # quite good, but single-threaded. I may fix it one day.
set(USE_RLE64             1)
set(USE_SANMAYCE_FNV      1)
set(USE_SCZ               0) # single threaded and far too inefficient to be worth fixing...
set(USE_SHA3_RND1         0) # didn't make it to the final...
460
461
462
463
464
465
466




467
468
469
470
471
472
473
    set(OTHER_SRC ${OTHER_SRC} codecs/mmini/huffman.c codecs/mmini/lzl.c)
    add_definitions(-DFSBENCH_USE_MMINI)
ENDIF(USE_MMINI)
IF(USE_MURMUR)
    set(OTHER_SRC ${OTHER_SRC} codecs/MurmurHash3.cpp)
    add_definitions(-DFSBENCH_USE_MURMUR)
ENDIF(USE_MURMUR)




IF(USE_QUICKLZ)
    include_directories (${FSBENCH_SOURCE_DIR}/codecs/quicklz) 
    set(OTHER_SRC ${OTHER_SRC} codecs/quicklz/quicklz1.c codecs/quicklz/quicklz2.c codecs/quicklz/quicklz3.c )
    add_definitions(-DFSBENCH_USE_QUICKLZ)
ENDIF(USE_QUICKLZ)
IF(USE_QUICKLZZIP)
    include_directories (${FSBENCH_SOURCE_DIR}/codecs/quicklz) 







>
>
>
>







461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
    set(OTHER_SRC ${OTHER_SRC} codecs/mmini/huffman.c codecs/mmini/lzl.c)
    add_definitions(-DFSBENCH_USE_MMINI)
ENDIF(USE_MMINI)
IF(USE_MURMUR)
    set(OTHER_SRC ${OTHER_SRC} codecs/MurmurHash3.cpp)
    add_definitions(-DFSBENCH_USE_MURMUR)
ENDIF(USE_MURMUR)
IF(USE_NAKAMICHI)
    set(OTHER_SRC ${OTHER_SRC} codecs/Nakamichi.c)
    add_definitions(-DFSBENCH_USE_NAKAMICHI)
ENDIF(USE_NAKAMICHI)
IF(USE_QUICKLZ)
    include_directories (${FSBENCH_SOURCE_DIR}/codecs/quicklz) 
    set(OTHER_SRC ${OTHER_SRC} codecs/quicklz/quicklz1.c codecs/quicklz/quicklz2.c codecs/quicklz/quicklz3.c )
    add_definitions(-DFSBENCH_USE_QUICKLZ)
ENDIF(USE_QUICKLZ)
IF(USE_QUICKLZZIP)
    include_directories (${FSBENCH_SOURCE_DIR}/codecs/quicklz) 

Changes to src/README.

173
174
175
176
177
178
179

180
181
182
183
184
185
186
mmini - lzl                         | Adam Ierymenko                                   | 2010            | no       | https://code.google.com/p/mmini
LZMAT                               | Vitaly Evseenko                                  | 1.1             | yes      | www.matcode.com/lzmat.htm
LZO                                 | Markus Franz Xaver Johannes Oberhumer            | 2.06            | yes      | www.oberhumer.com/opensource/lzo
LZSS                                | Ilia Muraviev                                    | 2008-07-31      | no       | http://encode.ru/threads/143-LZSS-v0-01-is-here!
LZV1                                | Hermann Vogt                                     | 0.5             | no       | http://encode.ru/threads/1661-LZWC-A-fast-tree-based-LZW-compressor
lzx_compress                        | Matthew T. Russotto                              | 2005-07-06      | no       |
miniz                               | Richard Geldreich, Jr.                           | 1.11            | yes      | https://code.google.com/p/miniz

nrv2a                               | Markus Franz Xaver Johannes Oberhumer            | 1.03            | no       | www.oberhumer.com/opensource/ucl
nrv2b                               | Markus Franz Xaver Johannes Oberhumer            | 1.03            | no       | www.oberhumer.com/opensource/ucl
nrv2d                               | Markus Franz Xaver Johannes Oberhumer            | 1.03            | no       | www.oberhumer.com/opensource/ucl
QuickLZ                             | Lasse Mikkel Reinhold                            | 1.5.1b6         | yes      | www.quicklz.com
QuickLZ zip                         | Lasse Mikkel Reinhold                            | 0.4             | no       | www.quicklz.com/zip.html
RLE64                               | Javier GutiƩrrez Chamorro                        | R3.00           | yes      | http://nikkhokkho.sourceforge.net/static.php?page=RLE64
SCZ                                 | Carl Kindman                                     | 2008-11-25      | no       |







>







173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
mmini - lzl                         | Adam Ierymenko                                   | 2010            | no       | https://code.google.com/p/mmini
LZMAT                               | Vitaly Evseenko                                  | 1.1             | yes      | www.matcode.com/lzmat.htm
LZO                                 | Markus Franz Xaver Johannes Oberhumer            | 2.06            | yes      | www.oberhumer.com/opensource/lzo
LZSS                                | Ilia Muraviev                                    | 2008-07-31      | no       | http://encode.ru/threads/143-LZSS-v0-01-is-here!
LZV1                                | Hermann Vogt                                     | 0.5             | no       | http://encode.ru/threads/1661-LZWC-A-fast-tree-based-LZW-compressor
lzx_compress                        | Matthew T. Russotto                              | 2005-07-06      | no       |
miniz                               | Richard Geldreich, Jr.                           | 1.11            | yes      | https://code.google.com/p/miniz
Nakamichi                           | Georgi Marinov                                   | r1              | no       | http://www.codeproject.com/Articles/250566/Fastest-strstr-like-function-in-C?msg=4800986#xx4800986xx
nrv2a                               | Markus Franz Xaver Johannes Oberhumer            | 1.03            | no       | www.oberhumer.com/opensource/ucl
nrv2b                               | Markus Franz Xaver Johannes Oberhumer            | 1.03            | no       | www.oberhumer.com/opensource/ucl
nrv2d                               | Markus Franz Xaver Johannes Oberhumer            | 1.03            | no       | www.oberhumer.com/opensource/ucl
QuickLZ                             | Lasse Mikkel Reinhold                            | 1.5.1b6         | yes      | www.quicklz.com
QuickLZ zip                         | Lasse Mikkel Reinhold                            | 0.4             | no       | www.quicklz.com/zip.html
RLE64                               | Javier GutiƩrrez Chamorro                        | R3.00           | yes      | http://nikkhokkho.sourceforge.net/static.php?page=RLE64
SCZ                                 | Carl Kindman                                     | 2008-11-25      | no       |

Changes to src/codecs.cpp.

257
258
259
260
261
262
263



264
265
266
267
268
269
270
#ifdef FSBENCH_USE_LZWC
              new Codec("LZWC", _LZWC_VERSION, LZWC_c, LZWC_d, no_blowup),
#endif
#ifdef FSBENCH_USE_MMINI
              new BufferedCodec("mmini_huffman", _MMINI_VERSION, mmini_huffman_c, mmini_huffman_d, no_blowup, MMINI_HUFFHEAP_SIZE),
              new Codec("mmini_lzl", _MMINI_VERSION, mmini_lzl_c, mmini_lzl_d, no_blowup),
#endif



#ifdef FSBENCH_USE_QUICKLZZIP
              new Codec("QuickLZ-zip", _QLZZIP_VERSION, qlzzip_c, 0),
#endif
#ifdef FSBENCH_USE_SHRINKER
              new Codec("Shrinker", _SHRINKER_VERSION, Shrinker_c, Shrinker_d, Shrinker_m),
#endif
#ifdef FSBENCH_USE_SNAPPY







>
>
>







257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
#ifdef FSBENCH_USE_LZWC
              new Codec("LZWC", _LZWC_VERSION, LZWC_c, LZWC_d, no_blowup),
#endif
#ifdef FSBENCH_USE_MMINI
              new BufferedCodec("mmini_huffman", _MMINI_VERSION, mmini_huffman_c, mmini_huffman_d, no_blowup, MMINI_HUFFHEAP_SIZE),
              new Codec("mmini_lzl", _MMINI_VERSION, mmini_lzl_c, mmini_lzl_d, no_blowup),
#endif
#ifdef FSBENCH_USE_NAKAMICHI
              new Codec("Nakamichi", _NAKAMICHI_VERSION, nakamichi_c, nakamichi_d),
#endif
#ifdef FSBENCH_USE_QUICKLZZIP
              new Codec("QuickLZ-zip", _QLZZIP_VERSION, qlzzip_c, 0),
#endif
#ifdef FSBENCH_USE_SHRINKER
              new Codec("Shrinker", _SHRINKER_VERSION, Shrinker_c, Shrinker_d, Shrinker_m),
#endif
#ifdef FSBENCH_USE_SNAPPY
510
511
512
513
514
515
516

517
518
519
520
521
522
523
      make_pair(raw_find_codec("LZSS-IM"), ""),
      make_pair(raw_find_codec("lzv1"), ""),
      make_pair(raw_find_codec("lzwc"), ""),
      make_pair(find_codec("lzx_compress/nop"), ""),
      make_pair(raw_find_codec("miniz"), ""),
      make_pair(raw_find_codec("mmini_huffman"), ""),
      make_pair(raw_find_codec("mmini_lzl"), ""),

      make_pair(raw_find_codec("nrv2b"), ""),
      make_pair(raw_find_codec("nrv2d"), ""),
      make_pair(raw_find_codec("nrv2e"), ""),
      make_pair(raw_find_codec("QuickLZ"), ""),
      make_pair(find_codec("QuickLZ-zip/zlib"), ""),
      make_pair(raw_find_codec("RLE64"), ""),
      make_pair(raw_find_codec("scz"), ""),







>







513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
      make_pair(raw_find_codec("LZSS-IM"), ""),
      make_pair(raw_find_codec("lzv1"), ""),
      make_pair(raw_find_codec("lzwc"), ""),
      make_pair(find_codec("lzx_compress/nop"), ""),
      make_pair(raw_find_codec("miniz"), ""),
      make_pair(raw_find_codec("mmini_huffman"), ""),
      make_pair(raw_find_codec("mmini_lzl"), ""),
      make_pair(raw_find_codec("Nakamichi"), ""),
      make_pair(raw_find_codec("nrv2b"), ""),
      make_pair(raw_find_codec("nrv2d"), ""),
      make_pair(raw_find_codec("nrv2e"), ""),
      make_pair(raw_find_codec("QuickLZ"), ""),
      make_pair(find_codec("QuickLZ-zip/zlib"), ""),
      make_pair(raw_find_codec("RLE64"), ""),
      make_pair(raw_find_codec("scz"), ""),

Changes to src/codecs.hpp.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/**
 * Written by m^2.
 * You can consider the code to be public domain.
 * If your country doesn't recognize author's right to relieve themselves of copyright,
 * you can use it under the terms of WTFPL version 2.0 or later.
 *
 *
 * To add a codec:
 * 1. Implement a Codec class for it or - if applicable - functions required by one of abstract ones
 * 2. Add sources to CMakeFiles.txt
 * 3. Add codec version to codecs.hpp
 * 4. Add codec to CODECS array in codecs.cpp
 * 5. Add codec to all_compressors / all_checksums / all_ciphers in codecs.cpp
 * 6. Update readme
 * 7. Update changelog
 */
#ifndef CODECS_HPP_BhjgkfG8
#define CODECS_HPP_BhjgkfG8

#include <list>
#include <exception>
#include <string>











<
|
|
|







1
2
3
4
5
6
7
8
9
10
11

12
13
14
15
16
17
18
19
20
21
/**
 * Written by m^2.
 * You can consider the code to be public domain.
 * If your country doesn't recognize author's right to relieve themselves of copyright,
 * you can use it under the terms of WTFPL version 2.0 or later.
 *
 *
 * To add a codec:
 * 1. Implement a Codec class for it or - if applicable - functions required by one of abstract ones
 * 2. Add sources to CMakeFiles.txt
 * 3. Add codec version to codecs.hpp

 * 4. Add codec to all_compressors / all_checksums / all_ciphers in codecs.cpp
 * 5. Update readme
 * 6. Update changelog
 */
#ifndef CODECS_HPP_BhjgkfG8
#define CODECS_HPP_BhjgkfG8

#include <list>
#include <exception>
#include <string>
199
200
201
202
203
204
205

206
207
208
209
210
211
212
#define _LZSSIM_VERSION     "2008-07-31"
#define _LZV1_VERSION       "0.5"
#define _LZWC_VERSION       "0.4"
#define _LZX_VERSION        "2005-07-06"
#define _MINIZ_VERSION      "1.11"
#define _MMINI_VERSION      "2012-12-23"
#define _MURMUR_VERSION     "2012-02-29"

#define _NRV_VERSION        "1.03"
#define _QLZ_VERSION        "1.5.1b6"
#define _QLZZIP_VERSION     "0.4"
#define _RLE64_VERSION      "R3.00"
#define _SANMAYCE_VERSION   "2013-06-16"
#define _SIPHASH_VERSION    "reference"
#define _SCZ_VERSION        "2008-11-25"







>







198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#define _LZSSIM_VERSION     "2008-07-31"
#define _LZV1_VERSION       "0.5"
#define _LZWC_VERSION       "0.4"
#define _LZX_VERSION        "2005-07-06"
#define _MINIZ_VERSION      "1.11"
#define _MMINI_VERSION      "2012-12-23"
#define _MURMUR_VERSION     "2012-02-29"
#define _NAKAMICHI_VERSION  "r1"
#define _NRV_VERSION        "1.03"
#define _QLZ_VERSION        "1.5.1b6"
#define _QLZZIP_VERSION     "0.4"
#define _RLE64_VERSION      "R3.00"
#define _SANMAYCE_VERSION   "2013-06-16"
#define _SIPHASH_VERSION    "reference"
#define _SCZ_VERSION        "2008-11-25"

Added src/codecs/Nakamichi.c.





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
// Nakamichi, revision 1-RSSBO, written by Kaze.
// Based on Nobuo Ito's source, thanks Ito.
// The main goal of Nakamichi is to allow supersimple and superfast decoding for English x-grams (mainly) in pure C, or not, heh-heh.
// Natively Nakamichi is targeted as 64bit tool with 16 threads, helping Kazahana to traverse faster when I/O is not superior.
// In short, Nakamichi is intended as x-gram decompressor.

// Eightfold Path ~ the Buddhist path to nirvana, comprising eight aspects in which an aspirant must become practised; 
// eightfold way ~ (Physics), the grouping of hadrons into supermultiplets by means of SU(3)); (b) adverb to eight times the number or quantity: OE.

// Note1: Fifteenth bit is not used, making the window wider by 1bit i.e. 32KB is not tempting, rather I think to use it as a flag: 8bytes/16bytes.
// Note2: English x-grams are as English texts but more redundant, in other words they are phraselists in most cases, sometimes wordlists.
// Note3: On OSHO.TXT, being a typical English text, Nakamichi's compression ratio is among the worsts:
//        206,908,949 OSHO.TXT
//        125,022,859 OSHO.TXT.Nakamichi
//        It struggles with English texts but decomprression speed is quite sweet (Core 2 T7500 2200MHz, 32bit code):
//        Nakamichi, revision 1-, written by Kaze.
//        Decompressing 125022859 bytes ...
//        RAM-to-RAM performance: 477681 KB/s.      
// Note4: Also I wanted to see how my 'Railgun_Swampshine_BailOut', being a HEAVYGUN i.e. with big overhead and latency, hits in a real-world application.

// Quick notes on PAGODAs (the padded x-gram lists):
// Every single word in English has its own PAGODA, in example below 'on' PAGODA is given (Kazahana_on.PAGODA-order-5.txt):
// PAGODA order 5 (i.e. with 5 tiers) has 5*(5+1)/2=15 subtiers, they are concatenated and space-padded in order to form the pillar 'on':
/*
D:\_KAZE\Nakamichi_r1-RSSBO>dir \_GW\ka*

04/12/2014  05:07 AM                14 Kazahana_on.1-1.txt
04/12/2014  05:07 AM         1,635,389 Kazahana_on.2-1.txt
04/12/2014  05:07 AM         1,906,734 Kazahana_on.2-2.txt
04/12/2014  05:07 AM        10,891,415 Kazahana_on.3-1.txt
04/12/2014  05:07 AM        15,797,703 Kazahana_on.3-2.txt
04/12/2014  05:07 AM        20,419,280 Kazahana_on.3-3.txt
04/12/2014  05:07 AM        22,141,823 Kazahana_on.4-1.txt
04/12/2014  05:07 AM        36,002,113 Kazahana_on.4-2.txt
04/12/2014  05:07 AM        33,236,772 Kazahana_on.4-3.txt
04/12/2014  05:07 AM        33,902,425 Kazahana_on.4-4.txt
04/12/2014  05:07 AM        24,795,989 Kazahana_on.5-1.txt
04/12/2014  05:07 AM        30,766,220 Kazahana_on.5-2.txt
04/12/2014  05:07 AM        38,982,816 Kazahana_on.5-3.txt
04/12/2014  05:07 AM        38,089,575 Kazahana_on.5-4.txt
04/12/2014  05:07 AM        34,309,057 Kazahana_on.5-5.txt
04/12/2014  05:07 AM       846,351,894 Kazahana_on.PAGODA-order-5.txt

D:\_KAZE\Nakamichi_r1-RSSBO>type \_GW\Kazahana_on.1-1.txt
9,999,999       on

D:\_KAZE\Nakamichi_r1-RSSBO>type \_GW\Kazahana_on.2-1.txt
9,999,999       on_the
1,148,054       on_his
0,559,694       on_her
0,487,856       on_this
0,399,485       on_your
0,381,570       on_my
0,367,282       on_their
...

D:\_KAZE\Nakamichi_r1-RSSBO>type \_GW\Kazahana_on.2-2.txt
0,545,191       based_on
0,397,408       and_on
0,334,266       go_on
0,329,561       went_on
0,263,035       was_on
0,246,332       it_on
0,229,041       down_on
0,202,151       going_on
...

D:\_KAZE\Nakamichi_r1-RSSBO>type \_GW\Kazahana_on.5-5.txt
0,083,564       foundation_osho_s_books_on
0,012,404       medium_it_may_be_on
0,012,354       if_you_received_it_on
0,012,152       medium_they_may_be_on
0,012,144       agree_to_also_provide_on
0,012,139       a_united_states_copyright_on
0,008,067       we_are_constantly_working_on
0,008,067       questions_we_have_received_on
0,006,847       file_was_first_posted_on
0,006,441       of_we_are_already_on
0,006,279       you_received_this_ebook_on
0,005,865       you_received_this_etext_on
0,005,833       to_keep_an_eye_on
...

D:\_KAZE\Nakamichi_r1-RSSBO>dir

04/12/2014  05:07 AM       846,351,894 Kazahana_on.PAGODA-order-5.txt

D:\_KAZE\Nakamichi_r1-RSSBO>Nakamichi.exe Kazahana_on.PAGODA-order-5.txt
Nakamichi, revision 1-RSSBO, written by Kaze.
Compressing 846351894 bytes ...
/; Each rotation means 128KB are encoded; Done 100%
RAM-to-RAM performance: 512 KB/s.

D:\_KAZE\Nakamichi_r1-RSSBO>dir

04/12/2014  05:07 AM       846,351,894 Kazahana_on.PAGODA-order-5.txt
04/15/2014  06:30 PM       293,049,398 Kazahana_on.PAGODA-order-5.txt.Nakamichi

D:\_KAZE\Nakamichi_r1-RSSBO>Nakamichi.exe Kazahana_on.PAGODA-order-5.txt.Nakamichi
Nakamichi, revision 1-RSSBO, written by Kaze.
Decompressing 293049398 bytes ...
RAM-to-RAM performance: 607 MB/s.

D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe Kazahana_on.PAGODA-order-5.txt 4096
YAPPY: [b 4K] bytes 846351894 -> 191149889  22.6%  comp  33.8 MB/s  uncomp 875.4 MB/s

D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe Kazahana_on.PAGODA-order-5.txt 8192
YAPPY: [b 8K] bytes 846351894 -> 184153244  21.8%  comp  35.0 MB/s  uncomp 898.3 MB/s

D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe Kazahana_on.PAGODA-order-5.txt 16384
YAPPY: [b 16K] bytes 846351894 -> 180650931  21.3%  comp  28.8 MB/s  uncomp 906.4 MB/s

D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe Kazahana_on.PAGODA-order-5.txt 32768
YAPPY: [b 32K] bytes 846351894 -> 178902966  21.1%  comp  35.0 MB/s  uncomp 906.4 MB/s

D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe Kazahana_on.PAGODA-order-5.txt 65536
YAPPY: [b 64K] bytes 846351894 -> 178027899  21.0%  comp  34.5 MB/s  uncomp 914.6 MB/s

D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe Kazahana_on.PAGODA-order-5.txt 131072
YAPPY: [b 128K] bytes 846351894 -> 177591807  21.0%  comp  34.9 MB/s  uncomp 906.4 MB/s

D:\_KAZE\Nakamichi_r1-RSSBO>
*/

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h> // uint64_t needed
#include <time.h>

//#include <emmintrin.h> // SSE2 intrinsics
//#include <smmintrin.h> // SSE4.1 intrinsics
//#include <immintrin.h> // AVX intrinsics

//void SlowCopy128bit (const char *SOURCE, char *TARGET) { _mm_storeu_si128((__m128i *)(TARGET), _mm_loadu_si128((const __m128i *)(SOURCE))); }

#ifndef NULL
#define NULL ((void*)0)
#endif

// Comment it to see how slower 'BruteForce' is, for Wikipedia 100MB the ratio is 41KB/s versus 197KB/s.
#define ReplaceBruteForceWithRailgunSwampshineBailOut

void SearchIntoSlidingWindow(unsigned int* retIndex, unsigned int* retMatch, char* refStart,char* refEnd,char* encStart,char* encEnd);
unsigned int SlidingWindowVsLookAheadBuffer(char* refStart, char* refEnd, char* encStart, char* encEnd);
unsigned int Compress(char* ret, char* src, unsigned int srcSize);
unsigned int Decompress(char* ret, char* src, unsigned int srcSize);
char * Railgun_Swampshine_BailOut(char * pbTarget, char * pbPattern, uint32_t cbTarget, uint32_t cbPattern);

// Min_Match_Length=THRESHOLD=4 means 4 and bigger are to be encoded:
#define Min_Match_BAILOUT_Length (8)
#define Min_Match_Length (8)
#define OffsetBITS (14)
#define LengthBITS (1)

//12bit
//#define REF_SIZE (4095+Min_Match_Length)
#define REF_SIZE ( ((1<<OffsetBITS)-1) + Min_Match_Length )
//3bit
//#define ENC_SIZE (7+Min_Match_Length)
#define ENC_SIZE ( ((1<<LengthBITS)-1) + Min_Match_Length )
/*
int main( int argc, char *argv[] ) {
	FILE *fp;
	int SourceSize;
	int TargetSize;
	char* SourceBlock=NULL;
	char* TargetBlock=NULL;
	char* Nakamichi = ".Nakamichi\0";
	char NewFileName[256];
	clock_t clocks1, clocks2;

	printf("Nakamichi, revision 1-RSSBO, written by Kaze.\n");
	if (argc==1) {
		printf("Usage: Nakamichi filename\n"); exit(13);
	}
	if ((fp = fopen(argv[1], "rb")) == NULL) {
		printf("Nakamichi: Can't open '%s' file.\n", argv[1]); exit(13);
	}
	fseek(fp, 0, SEEK_END);
	SourceSize = ftell(fp);
	fseek(fp, 0, SEEK_SET);
	// If filename ends in '.Nakamichi' then mode is decompression otherwise compression.
	if (strcmp(argv[1]+(strlen(argv[1])-strlen(Nakamichi)), Nakamichi) == 0) {
	SourceBlock = (char*)malloc(SourceSize+512);
	TargetBlock = (char*)malloc(5*SourceSize+512);
	fread(SourceBlock, 1, SourceSize, fp);
	fclose(fp);
		printf("Decompressing %d bytes ...\n", SourceSize );
		clocks1 = clock();
		TargetSize = Decompress(TargetBlock, SourceBlock, SourceSize);
		clocks2 = clock();
		printf("RAM-to-RAM performance: %d MB/s.\n", ((TargetSize/(clocks2 - clocks1 + 1))*(long)1000)>>20);
		strcpy(NewFileName, argv[1]);
		*( NewFileName + strlen(argv[1])-strlen(Nakamichi) ) = '\0';
	} else {
	SourceBlock = (char*)malloc(SourceSize+512);
	TargetBlock = (char*)malloc(SourceSize+512);
	fread(SourceBlock, 1, SourceSize, fp);
	fclose(fp);
		printf("Compressing %d bytes ...\n", SourceSize );
		clocks1 = clock();
		TargetSize = Compress(TargetBlock, SourceBlock, SourceSize);
		clocks2 = clock();
		printf("RAM-to-RAM performance: %d KB/s.\n", ((SourceSize/(clocks2 - clocks1 + 1))*(long)1000)>>10);
		strcpy(NewFileName, argv[1]);
		strcat(NewFileName, Nakamichi);
	}
	if ((fp = fopen(NewFileName, "wb")) == NULL) {
		printf("Nakamichi: Can't write '%s' file.\n", NewFileName); exit(13);
	}
	fwrite(TargetBlock, 1, TargetSize, fp);
	fclose(fp);
	free(TargetBlock);
	free(SourceBlock);
	exit(0);
}
*/
void SearchIntoSlidingWindow(unsigned int* retIndex, unsigned int* retMatch, char* refStart,char* refEnd,char* encStart,char* encEnd){
	char* FoundAtPosition;
	unsigned int match=0;
	*retIndex=0;
	*retMatch=0;
#ifdef ReplaceBruteForceWithRailgunSwampshineBailOut
	if (refStart < refEnd) {
		FoundAtPosition = Railgun_Swampshine_BailOut(refStart, encStart, (uint32_t)(refEnd-refStart), 8);
		if (FoundAtPosition!=NULL) {
			*retMatch=8;
			*retIndex=refEnd-FoundAtPosition;
		}
	}
#else				
	while(refStart < refEnd){
		match=SlidingWindowVsLookAheadBuffer(refStart,refEnd,encStart,encEnd);
		if(match > *retMatch){
			*retMatch=match;
			*retIndex=refEnd-refStart;
		}
		if(*retMatch >= Min_Match_BAILOUT_Length) break;
		refStart++;
	}
#endif
}

unsigned int SlidingWindowVsLookAheadBuffer( char* refStart, char* refEnd, char* encStart,char* encEnd){
	int ret = 0;
	while(refStart[ret] == encStart[ret]){
		if(&refStart[ret] >= refEnd) break;
		if(&encStart[ret] >= encEnd) break;
		ret++;
		if(ret >= Min_Match_BAILOUT_Length) break;
	}
	return ret;
}

unsigned int Compress(char* ret, char* src, unsigned int srcSize){
	unsigned int srcIndex=0;
	unsigned int retIndex=0;
	unsigned int index=0;
	unsigned int match=0;
	unsigned int notMatch=0;
	char* notMatchStart=NULL;
	char* refStart=NULL;
	char* encEnd=NULL;
	int Melnitchka=0;
	char *Auberge[4] = {"|\0","/\0","-\0","\\\0"};
	int ProgressIndicator;

	while(srcIndex < srcSize){
		if(srcIndex>=REF_SIZE)
			refStart=&src[srcIndex-REF_SIZE];
		else
			refStart=src;
		if(srcIndex>=srcSize-ENC_SIZE)
			encEnd=&src[srcSize];
		else
			encEnd=&src[srcIndex+ENC_SIZE];

		SearchIntoSlidingWindow(&index,&match,refStart,&src[srcIndex],&src[srcIndex],encEnd);
		//if ( match<Min_Match_Length ) {
		//if ( match<Min_Match_Length || match<8 ) {
		if ( match<8 ) {
			if(notMatch==0){
				notMatchStart=&ret[retIndex];
				retIndex++;
			}
			else if (notMatch==127) {
				*notMatchStart=127;
				notMatch=0;
				notMatchStart=&ret[retIndex];
				retIndex++;
			}
			ret[retIndex]=src[srcIndex];
			retIndex++;
			notMatch++;
			srcIndex++;
		} else {
			match = 8;
			if(notMatch > 0){
				*notMatchStart=notMatch;
				notMatch=0;
			}
			ret[retIndex] = 0x80;
			// 1bit+3bits+12bits:
			//ret[retIndex] = ret[retIndex] | ((match-Min_Match_Length)<<4);
			//ret[retIndex] = ret[retIndex] | (((index-Min_Match_Length) & 0x0F00)>>8);
			// 1bit+1bit+14bits:
			ret[retIndex] = ret[retIndex] | ((match-Min_Match_Length)<<(8-(LengthBITS+1)));
			ret[retIndex] = ret[retIndex] | (((index-Min_Match_Length) & 0x3F00)>>8); // 2+4+8=14
			retIndex++;
			ret[retIndex] = (char)((index-Min_Match_Length) & 0x00FF);
			retIndex++;

			srcIndex+=match;
		}
		if (srcIndex % (1<<17) == 0) {
			ProgressIndicator = (int)( (srcIndex+1)*(float)100/(srcSize+1) );
			//printf("%s; Each rotation means 128KB are encoded; Done %d%%\r", Auberge[Melnitchka++], ProgressIndicator );
			Melnitchka = Melnitchka & 3; // 0 1 2 3: 00 01 10 11
		}
	}
	if(notMatch > 0){
		*notMatchStart=notMatch;
	}
	//printf("%s; Each rotation means 128KB are encoded; Done %d%%\n", Auberge[Melnitchka], 100 );
	return retIndex;
}

unsigned int Decompress(char* ret, char* src, unsigned int srcSize){
	unsigned int srcIndex=0;
	unsigned int retIndex=0;
	unsigned int index=0;
	unsigned int match=0;

	while(srcIndex < srcSize){
		if((unsigned char)src[srcIndex] <= 127){
			memcpy(&ret[retIndex],&src[srcIndex+1],src[srcIndex]); // Use padding and replace 'memcpy' with loop of 4 or 4+4 transfers/stores i.e. *()=DWORD
			retIndex+=src[srcIndex];
			srcIndex+=(src[srcIndex]+1);
		}
		else{
			// 1bit+3bits+12bits:
			//match = ((src[srcIndex] & 0x7F) >> 4)+Min_Match_Length;
			//index = (src[srcIndex] & 0x0F) << 8;
			// 1bit+1bit+14bits:
			//match = ((src[srcIndex] & 0x4F) >> 4)+Min_Match_Length; // In fact, not needed when eightfoldness is commenced, match is 8.
			match=8; // or 16 in next revision.
			index = (src[srcIndex] & 0x3F) << 8;
			srcIndex++;
			index = (index | (unsigned int)(0x00FF & src[srcIndex])) + Min_Match_Length;
			srcIndex++;
			//memcpy(&ret[retIndex],&ret[retIndex-index],match); // Replace 'memcpy' with 4 or 4+4 transfer/store i.e. *()=DWORD
				//*(uint32_t*)(ret+retIndex) = *(uint32_t*)(ret+retIndex-index);
				//*(uint32_t*)(ret+retIndex+4) = *(uint32_t*)(ret+retIndex-index+4);
					//*(uint64_t*)(ret+retIndex) = *(uint64_t*)(ret+retIndex-index);
			*(uint64_t*)(ret+retIndex) = *(uint64_t*)(ret+retIndex-index);
			retIndex+=match;
		}
	}
	return retIndex;
}

// Decompression main loop, 84-2e+2=88 bytes long:
/*
; mark_description "Intel(R) C++ Compiler XE for applications running on IA-32, Version 12.1.1.258 Build 20111011";
; mark_description "-O3 -FAcs";
.B5.3:                          
  0002e 8d 14 1f         lea edx, DWORD PTR [edi+ebx]           
  00031 0f be 0a         movsx ecx, BYTE PTR [edx]              
  00034 0f b6 c1         movzx eax, cl                          
  00037 83 f8 7f         cmp eax, 127                           
  0003a 7e 27            jle .B5.5 
.B5.4:                          
  0003c 83 e1 3f         and ecx, 63                            
  0003f 83 c3 02         add ebx, 2                             
  00042 c1 e1 08         shl ecx, 8                             
  00045 0f b6 42 01      movzx eax, BYTE PTR [1+edx]            
  00049 0b c8            or ecx, eax                            
  0004b f7 d9            neg ecx                                
  0004d 8d 54 35 00      lea edx, DWORD PTR [ebp+esi]           
  00051 83 c6 08         add esi, 8                             
  00054 8b 44 11 f8      mov eax, DWORD PTR [-8+ecx+edx]        
  00058 8b 4c 11 fc      mov ecx, DWORD PTR [-4+ecx+edx]        
  0005c 89 02            mov DWORD PTR [edx], eax               
  0005e 89 4a 04         mov DWORD PTR [4+edx], ecx             
  00061 eb 1d            jmp .B5.7 
.B5.5:                          
  00063 51               push ecx                               
  00064 8d 44 1f 01      lea eax, DWORD PTR [1+edi+ebx]         
  00068 50               push eax                               
  00069 8d 54 35 00      lea edx, DWORD PTR [ebp+esi]           
  0006d 52               push edx                               
  0006e e8 fc ff ff ff   call __intel_fast_memcpy               
.B5.12:                         
  00073 83 c4 0c         add esp, 12                            
.B5.6:                          
  00076 0f be 0c 3b      movsx ecx, BYTE PTR [ebx+edi]          
  0007a 03 f1            add esi, ecx                           
  0007c 8d 5c 0b 01      lea ebx, DWORD PTR [1+ebx+ecx]         
.B5.7:                          
  00080 3b 5c 24 1c      cmp ebx, DWORD PTR [28+esp]            
  00084 72 a8            jb .B5.3 
*/

// The Speed Showdown on my 'Bonboniera' laptop (Core 2 T7500 2200MHz, Windows 7 64bit):
// Grrr, in round #1 Yappy kicked my ass, yet, 4 more rounds remain...
/*
D:\_KAZE\Nakamichi_r1-RSSBO>NakamichiVsYappy.bat
Speed Showdown: Nakamichi VS Yappy, both compiled with Intel v12.1 ...

D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe enwiki-20140304-pages-articles.7z.001 1024
YAPPY: [b 1K] bytes 104857600 -> 78039768  74.4%  comp  42.7 MB/s  uncomp 680.3 MB/s

D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe enwiki-20140304-pages-articles.7z.001 2048
YAPPY: [b 2K] bytes 104857600 -> 70845249  67.6%  comp  39.8 MB/s  uncomp 623.3 MB/s

D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe enwiki-20140304-pages-articles.7z.001 4096
YAPPY: [b 4K] bytes 104857600 -> 64270963  61.3%  comp  36.2 MB/s  uncomp 583.1 MB/s

D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe enwiki-20140304-pages-articles.7z.001 8192
YAPPY: [b 8K] bytes 104857600 -> 59756354  57.0%  comp  32.9 MB/s  uncomp 549.5 MB/s

D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe enwiki-20140304-pages-articles.7z.001 16384
YAPPY: [b 16K] bytes 104857600 -> 57497885  54.8%  comp  31.4 MB/s  uncomp 541.5 MB/s

D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe enwiki-20140304-pages-articles.7z.001 32768
YAPPY: [b 32K] bytes 104857600 -> 56365696  53.8%  comp  31.1 MB/s  uncomp 541.5 MB/s

D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe enwiki-20140304-pages-articles.7z.001 65536
YAPPY: [b 64K] bytes 104857600 -> 55799079  53.2%  comp  31.3 MB/s  uncomp 534.3 MB/s

D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe enwiki-20140304-pages-articles.7z.001 262144
YAPPY: [b 256K] bytes 104857600 -> 55377127  52.8%  comp  31.4 MB/s  uncomp 534.3 MB/s

D:\_KAZE\Nakamichi_r1-RSSBO>Nakamichi.exe enwiki-20140304-pages-articles.7z.001
Nakamichi, revision 1-RSSBO, written by Kaze.
Compressing 104857600 bytes ...
-; Each rotation means 128KB are encoded; Done 100%
RAM-to-RAM performance: 198 KB/s.

D:\_KAZE\Nakamichi_r1-RSSBO>Nakamichi.exe enwiki-20140304-pages-articles.7z.001.Nakamichi
Nakamichi, revision 1-RSSBO, written by Kaze.
Decompressing 70533827 bytes ...
RAM-to-RAM performance: 531 MB/s.

D:\_KAZE\Nakamichi_r1-RSSBO>
*/

// In my opinion Hamid Buzidi is the best, therefore his lzturbo v1.1 reference results are given below:
/*
D:\_KAZE\Nakamichi_r1-RSSBO>timer32 lzturbo.exe -19 -p0 enwiki-20140304-pages-articles.7z.001 .

Kernel  Time =     0.982 =    0%
User    Time =   152.537 =   99%
Process Time =   153.520 =  100%    Virtual  Memory =    429 MB
Global  Time =   153.519 =  100%    Physical Memory =    407 MB

D:\_KAZE\Nakamichi_r1-RSSBO>timer32.exe lzturbo.exe -d enwiki-20140304-pages-articles.7z.001.lzt .

Kernel  Time =     0.234 =   62%
User    Time =     0.187 =   50%
Process Time =     0.421 =  112%    Virtual  Memory =     98 MB
Global  Time =     0.374 =  100%    Physical Memory =     70 MB

D:\_KAZE\Nakamichi_r1-RSSBO>dir

04/15/2014  08:05 AM       104,857,600 enwiki-20140304-pages-articles.7z.001
04/15/2014  08:04 AM        41,984,881 enwiki-20140304-pages-articles.7z.001.lzt

D:\_KAZE\Nakamichi_r1-RSSBO>timer32 lzturbo.exe -11 -p0 enwiki-20140304-pages-articles.7z.001 .

Kernel  Time =     0.171 =    9%
User    Time =     1.622 =   90%
Process Time =     1.794 =  100%    Virtual  Memory =     58 MB
Global  Time =     1.794 =  100%    Physical Memory =     39 MB

D:\_KAZE\Nakamichi_r1-RSSBO>timer32.exe lzturbo.exe -d enwiki-20140304-pages-articles.7z.001.lzt .

Kernel  Time =     0.249 =   41%
User    Time =     0.140 =   23%
Process Time =     0.390 =   64%    Virtual  Memory =     98 MB
Global  Time =     0.608 =  100%    Physical Memory =     73 MB

D:\_KAZE\Nakamichi_r1-RSSBO>dir

04/15/2014  08:05 AM       104,857,600 enwiki-20140304-pages-articles.7z.001
04/15/2014  08:05 AM        47,685,453 enwiki-20140304-pages-articles.7z.001.lzt

D:\_KAZE\Nakamichi_r1-RSSBO>
*/

// Railgun_Swampshine_BailOut, copyleft 2014-Jan-31, Kaze.
// Caution: For better speed the case 'if (cbPattern==1)' was removed, so Pattern must be longer than 1 char.
#define NeedleThreshold2vs4swampLITE 9+10 // Should be bigger than 9. BMH2 works up to this value (inclusive), if bigger then BMH4 takes over.
char * Railgun_Swampshine_BailOut (char * pbTarget, char * pbPattern, uint32_t cbTarget, uint32_t cbPattern)
{
	char * pbTargetMax = pbTarget + cbTarget;
	register uint32_t ulHashPattern;
	signed long count;

	unsigned char bm_Horspool_Order2[256*256]; // Bitwise soon...
	uint32_t i, Gulliver;

	uint32_t PRIMALposition, PRIMALpositionCANDIDATE;
	uint32_t PRIMALlength, PRIMALlengthCANDIDATE;
	uint32_t j, FoundAtPosition;

	if (cbPattern > cbTarget) return(NULL);

	if ( cbPattern<4 ) { 
		// SSE2 i.e. 128bit Assembly rules here:
		// ...
        	pbTarget = pbTarget+cbPattern;
		ulHashPattern = ( (*(char *)(pbPattern))<<8 ) + *(pbPattern+(cbPattern-1));
		if ( cbPattern==3 ) {
			for ( ;; ) {
				if ( ulHashPattern == ( (*(char *)(pbTarget-3))<<8 ) + *(pbTarget-1) ) {
					if ( *(char *)(pbPattern+1) == *(char *)(pbTarget-2) ) return((pbTarget-3));
				}
				if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) { 
					pbTarget++;
					if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) pbTarget++;
				}
				pbTarget++;
				if (pbTarget > pbTargetMax) return(NULL);
			}
		} else {
		}
		for ( ;; ) {
			if ( ulHashPattern == ( (*(char *)(pbTarget-2))<<8 ) + *(pbTarget-1) ) return((pbTarget-2));
			if ( (char)(ulHashPattern>>8) != *(pbTarget-1) ) pbTarget++;
			pbTarget++;
			if (pbTarget > pbTargetMax) return(NULL);
		}
	} else { //if ( cbPattern<4 )
		if ( cbPattern<=NeedleThreshold2vs4swampLITE ) { 
			// BMH order 2, needle should be >=4:
			ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
			for (i=0; i < 256*256; i++) {bm_Horspool_Order2[i]=0;}
			for (i=0; i < cbPattern-1; i++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+i)]=1;
			i=0;
			while (i <= cbTarget-cbPattern) {
				Gulliver = 1; // 'Gulliver' is the skip
				if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]] != 0 ) {
					if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1-2]] == 0 ) Gulliver = cbPattern-(2-1)-2; else {
						if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) { // This fast check ensures not missing a match (for remainder) when going under 0 in loop below:
							count = cbPattern-4+1; 
							while ( count > 0 && *(uint32_t *)(pbPattern+count-1) == *(uint32_t *)(&pbTarget[i]+(count-1)) )
								count = count-4;
							if ( count <= 0 ) return(pbTarget+i);
						}
					}
				} else Gulliver = cbPattern-(2-1);
				i = i + Gulliver;
				//GlobalI++; // Comment it, it is only for stats.
			}
			return(NULL);
		} else { // if ( cbPattern<=NeedleThreshold2vs4swampLITE )

// Swampwalker_BAILOUT heuristic order 4 (Needle should be bigger than 4) [
// Needle: 1234567890qwertyuiopasdfghjklzxcv            PRIMALposition=01 PRIMALlength=33  '1234567890qwertyuiopasdfghjklzxcv'
// Needle: vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv            PRIMALposition=29 PRIMALlength=04  'vvvv'
// Needle: vvvvvvvvvvBOOMSHAKALAKAvvvvvvvvvv            PRIMALposition=08 PRIMALlength=20  'vvvBOOMSHAKALAKAvvvv'
// Needle: Trollland                                    PRIMALposition=01 PRIMALlength=09  'Trollland'
// Needle: Swampwalker                                  PRIMALposition=01 PRIMALlength=11  'Swampwalker'
// Needle: licenselessness                              PRIMALposition=01 PRIMALlength=15  'licenselessness'
// Needle: alfalfa                                      PRIMALposition=02 PRIMALlength=06  'lfalfa'
// Needle: Sandokan                                     PRIMALposition=01 PRIMALlength=08  'Sandokan'
// Needle: shazamish                                    PRIMALposition=01 PRIMALlength=09  'shazamish'
// Needle: Simplicius Simplicissimus                    PRIMALposition=06 PRIMALlength=20  'icius Simplicissimus'
// Needle: domilliaquadringenquattuorquinquagintillion  PRIMALposition=01 PRIMALlength=32  'domilliaquadringenquattuorquinqu'
// Needle: boom-boom                                    PRIMALposition=02 PRIMALlength=08  'oom-boom'
// Needle: vvvvv                                        PRIMALposition=01 PRIMALlength=04  'vvvv'
// Needle: 12345                                        PRIMALposition=01 PRIMALlength=05  '12345'
// Needle: likey-likey                                  PRIMALposition=03 PRIMALlength=09  'key-likey'
// Needle: BOOOOOM                                      PRIMALposition=03 PRIMALlength=05  'OOOOM'
// Needle: aaaaaBOOOOOM                                 PRIMALposition=02 PRIMALlength=09  'aaaaBOOOO'
// Needle: BOOOOOMaaaaa                                 PRIMALposition=03 PRIMALlength=09  'OOOOMaaaa'
PRIMALlength=0;
for (i=0+(1); i < cbPattern-((4)-1)+(1)-(1); i++) { // -(1) because the last BB order 4 has no counterpart(s)
	FoundAtPosition = cbPattern - ((4)-1) + 1;
	PRIMALpositionCANDIDATE=i;
	while ( PRIMALpositionCANDIDATE <= (FoundAtPosition-1) ) {
		j = PRIMALpositionCANDIDATE + 1;
		while ( j <= (FoundAtPosition-1) ) {
			if ( *(uint32_t *)(pbPattern+PRIMALpositionCANDIDATE-(1)) == *(uint32_t *)(pbPattern+j-(1)) ) FoundAtPosition = j;
			j++;
		}
		PRIMALpositionCANDIDATE++;
	}
	PRIMALlengthCANDIDATE = (FoundAtPosition-1)-i+1 +((4)-1);
	if (PRIMALlengthCANDIDATE >= PRIMALlength) {PRIMALposition=i; PRIMALlength = PRIMALlengthCANDIDATE;}
	if (cbPattern-i+1 <= PRIMALlength) break;
	if (PRIMALlength > 128) break; // Bail Out for 129[+]
}
// Swampwalker_BAILOUT heuristic order 4 (Needle should be bigger than 4) ]

// Here we have 4 or bigger NewNeedle, apply order 2 for pbPattern[i+(PRIMALposition-1)] with length 'PRIMALlength' and compare the pbPattern[i] with length 'cbPattern':
PRIMALlengthCANDIDATE = cbPattern;
cbPattern = PRIMALlength;
pbPattern = pbPattern + (PRIMALposition-1);

// Revision 2 commented section [
/*
if (cbPattern-1 <= 255) {
// BMH Order 2 [
			ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
			for (i=0; i < 256*256; i++) {bm_Horspool_Order2[i]= cbPattern-1;} // cbPattern-(Order-1) for Horspool; 'memset' if not optimized
			for (i=0; i < cbPattern-1; i++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+i)]=i; // Rightmost appearance/position is needed
			i=0;
			while (i <= cbTarget-cbPattern) { 
				Gulliver = bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]];
				if ( Gulliver != cbPattern-1 ) { // CASE #2: if equal means the pair (char order 2) is not found i.e. Gulliver remains intact, skip the whole pattern and fall back (Order-1) chars i.e. one char for Order 2
				if ( Gulliver == cbPattern-2 ) { // CASE #1: means the pair (char order 2) is found
					if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {
						count = cbPattern-4+1; 
						while ( count > 0 && *(uint32_t *)(pbPattern+count-1) == *(uint32_t *)(&pbTarget[i]+(count-1)) )
							count = count-4;
// If we miss to hit then no need to compare the original: Needle
if ( count <= 0 ) {
// I have to add out-of-range checks...
// i-(PRIMALposition-1) >= 0
// &pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4
// i-(PRIMALposition-1)+(count-1) >= 0
// &pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4
	if ( (i-(PRIMALposition-1) >= 0) && (&pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4) && (&pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4) ) {
		if ( *(uint32_t *)&pbTarget[i-(PRIMALposition-1)] == *(uint32_t *)(pbPattern-(PRIMALposition-1))) { // This fast check ensures not missing a match (for remainder) when going under 0 in loop below:
			count = PRIMALlengthCANDIDATE-4+1; 
			while ( count > 0 && *(uint32_t *)(pbPattern-(PRIMALposition-1)+count-1) == *(uint32_t *)(&pbTarget[i-(PRIMALposition-1)]+(count-1)) )
				count = count-4;
			if ( count <= 0 ) return(pbTarget+i-(PRIMALposition-1));	
		}
	}
}
					}
					Gulliver = 1;
				} else
					Gulliver = cbPattern - Gulliver - 2; // CASE #3: the pair is found and not as suffix i.e. rightmost position
				}
				i = i + Gulliver;
				//GlobalI++; // Comment it, it is only for stats.
			}
			return(NULL);
// BMH Order 2 ]
} else {
			// BMH order 2, needle should be >=4:
			ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
			for (i=0; i < 256*256; i++) {bm_Horspool_Order2[i]=0;}
			for (i=0; i < cbPattern-1; i++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+i)]=1;
			i=0;
			while (i <= cbTarget-cbPattern) {
				Gulliver = 1; // 'Gulliver' is the skip
				if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]] != 0 ) {
					if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1-2]] == 0 ) Gulliver = cbPattern-(2-1)-2; else {
						if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) { // This fast check ensures not missing a match (for remainder) when going under 0 in loop below:
							count = cbPattern-4+1; 
							while ( count > 0 && *(uint32_t *)(pbPattern+count-1) == *(uint32_t *)(&pbTarget[i]+(count-1)) )
								count = count-4;
// If we miss to hit then no need to compare the original: Needle
if ( count <= 0 ) {
// I have to add out-of-range checks...
// i-(PRIMALposition-1) >= 0
// &pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4
// i-(PRIMALposition-1)+(count-1) >= 0
// &pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4
	if ( (i-(PRIMALposition-1) >= 0) && (&pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4) && (&pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4) ) {
		if ( *(uint32_t *)&pbTarget[i-(PRIMALposition-1)] == *(uint32_t *)(pbPattern-(PRIMALposition-1))) { // This fast check ensures not missing a match (for remainder) when going under 0 in loop below:
			count = PRIMALlengthCANDIDATE-4+1; 
			while ( count > 0 && *(uint32_t *)(pbPattern-(PRIMALposition-1)+count-1) == *(uint32_t *)(&pbTarget[i-(PRIMALposition-1)]+(count-1)) )
				count = count-4;
			if ( count <= 0 ) return(pbTarget+i-(PRIMALposition-1));	
		}
	}
}
						}
					}
				} else Gulliver = cbPattern-(2-1);
				i = i + Gulliver;
				//GlobalI++; // Comment it, it is only for stats.
			}
			return(NULL);
}
*/
// Revision 2 commented section ]

		if ( cbPattern<=NeedleThreshold2vs4swampLITE ) { 

			// BMH order 2, needle should be >=4:
			ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
			for (i=0; i < 256*256; i++) {bm_Horspool_Order2[i]=0;}
			for (i=0; i < cbPattern-1; i++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+i)]=1;
			i=0;
			while (i <= cbTarget-cbPattern) {
				Gulliver = 1; // 'Gulliver' is the skip
				if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]] != 0 ) {
					if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1-2]] == 0 ) Gulliver = cbPattern-(2-1)-2; else {
						if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) { // This fast check ensures not missing a match (for remainder) when going under 0 in loop below:
							count = cbPattern-4+1; 
							while ( count > 0 && *(uint32_t *)(pbPattern+count-1) == *(uint32_t *)(&pbTarget[i]+(count-1)) )
								count = count-4;
// If we miss to hit then no need to compare the original: Needle
if ( count <= 0 ) {
// I have to add out-of-range checks...
// i-(PRIMALposition-1) >= 0
// &pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4
// i-(PRIMALposition-1)+(count-1) >= 0
// &pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4
	if ( (i-(PRIMALposition-1) >= 0) && (&pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4) && (&pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4) ) {
		if ( *(uint32_t *)&pbTarget[i-(PRIMALposition-1)] == *(uint32_t *)(pbPattern-(PRIMALposition-1))) { // This fast check ensures not missing a match (for remainder) when going under 0 in loop below:
			count = PRIMALlengthCANDIDATE-4+1; 
			while ( count > 0 && *(uint32_t *)(pbPattern-(PRIMALposition-1)+count-1) == *(uint32_t *)(&pbTarget[i-(PRIMALposition-1)]+(count-1)) )
				count = count-4;
			if ( count <= 0 ) return(pbTarget+i-(PRIMALposition-1));	
		}
	}
}
						}
					}
				} else Gulliver = cbPattern-(2-1);
				i = i + Gulliver;
				//GlobalI++; // Comment it, it is only for stats.
			}
			return(NULL);

		} else { // if ( cbPattern<=NeedleThreshold2vs4swampLITE )

			// BMH pseudo-order 4, needle should be >=8+2:
			ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes
			for (i=0; i < 256*256; i++) {bm_Horspool_Order2[i]=0;}
			// In line below we "hash" 4bytes to 2bytes i.e. 16bit table, how to compute TOTAL number of BBs, 'cbPattern - Order + 1' is the number of BBs for text 'cbPattern' bytes long, for example, for cbPattern=11 'fastest fox' and Order=4 we have BBs = 11-4+1=8:
			//"fast"
			//"aste"
			//"stes"
			//"test"
			//"est "
			//"st f"
			//"t fo"
			//" fox"
			//for (i=0; i < cbPattern-4+1; i++) bm_Horspool_Order2[( *(unsigned short *)(pbPattern+i+0) + *(unsigned short *)(pbPattern+i+2) ) & ( (1<<16)-1 )]=1;
			//for (i=0; i < cbPattern-4+1; i++) bm_Horspool_Order2[( (*(uint32_t *)(pbPattern+i+0)>>16)+(*(uint32_t *)(pbPattern+i+0)&0xFFFF) ) & ( (1<<16)-1 )]=1;
			// Above line is replaced by next one with better hashing:
			for (i=0; i < cbPattern-4+1; i++) bm_Horspool_Order2[( (*(uint32_t *)(pbPattern+i+0)>>(16-1))+(*(uint32_t *)(pbPattern+i+0)&0xFFFF) ) & ( (1<<16)-1 )]=1;
			i=0;
			while (i <= cbTarget-cbPattern) {
				Gulliver = 1;
				//if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2]&0xFFFF) ) & ( (1<<16)-1 )] != 0 ) { // DWORD #1
				// Above line is replaced by next one with better hashing:
				if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2]>>(16-1))+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2]&0xFFFF) ) & ( (1<<16)-1 )] != 0 ) { // DWORD #1
					//if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) ) & ( (1<<16)-1 )] == 0 ) Gulliver = cbPattern-(2-1)-2-4; else {
					// Above line is replaced in order to strengthen the skip by checking the middle DWORD,if the two DWORDs are 'ab' and 'cd' i.e. [2x][2a][2b][2c][2d] then the middle DWORD is 'bc'.
					// The respective offsets (backwards) are: -10/-8/-6/-4 for 'xa'/'ab'/'bc'/'cd'.
					//if ( ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]&0xFFFF) ) & ( (1<<16)-1 )] ) + ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) ) & ( (1<<16)-1 )] ) + ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]&0xFFFF) ) & ( (1<<16)-1 )] ) < 3 ) Gulliver = cbPattern-(2-1)-2-4-2; else {
					// Above line is replaced by next one with better hashing:
					// When using (16-1) right shifting instead of 16 we will have two different pairs (if they are equal), the highest bit being lost do the job especialy for ASCII texts with no symbols in range 128-255.
					// Example for genomesque pair TT+TT being shifted by (16-1):
					// T            = 01010100
					// TT           = 01010100 01010100
					// TTTT         = 01010100 01010100 01010100 01010100
					// TTTT>>16     = 00000000 00000000 01010100 01010100
					// TTTT>>(16-1) = 00000000 00000000 10101000 10101000 <--- Due to the left shift by 1, the 8th bits of 1st and 2nd bytes are populated - usually they are 0 for English texts & 'ACGT' data.
					//if ( ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]>>(16-1))+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]&0xFFFF) ) & ( (1<<16)-1 )] ) + ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]>>(16-1))+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) ) & ( (1<<16)-1 )] ) + ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]>>(16-1))+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]&0xFFFF) ) & ( (1<<16)-1 )] ) < 3 ) Gulliver = cbPattern-(2-1)-2-4-2; else {
					// 'Maximus' uses branched 'if', again.
					if ( \
					( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6 +1]>>(16-1))+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6 +1]&0xFFFF) ) & ( (1<<16)-1 )] ) == 0 \
					|| ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4 +1]>>(16-1))+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4 +1]&0xFFFF) ) & ( (1<<16)-1 )] ) == 0 \
					) Gulliver = cbPattern-(2-1)-2-4-2 +1; else {
					// Above line is not optimized (several a SHR are used), we have 5 non-overlapping WORDs, or 3 overlapping WORDs, within 4 overlapping DWORDs so:
// [2x][2a][2b][2c][2d]
// DWORD #4
// [2a] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]>>16) =     !SHR to be avoided! <--
// [2x] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]&0xFFFF) =                        |
//     DWORD #3                                                                       |
// [2b] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]>>16) =     !SHR to be avoided!   |<--
// [2a] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) = ------------------------  |
//         DWORD #2                                                                      |
// [2c] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]>>16) =     !SHR to be avoided!      |<--
// [2b] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]&0xFFFF) = ---------------------------  |
//             DWORD #1                                                                     |
// [2d] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-0]>>16) =                                 |
// [2c] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-0]&0xFFFF) = ------------------------------
//
// So in order to remove 3 SHR instructions the equal extractions are:
// DWORD #4
// [2a] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) =  !SHR to be avoided! <--
// [2x] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]&0xFFFF) =                        |
//     DWORD #3                                                                       |
// [2b] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]&0xFFFF) =  !SHR to be avoided!   |<--
// [2a] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) = ------------------------  |
//         DWORD #2                                                                      |
// [2c] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-0]&0xFFFF) =  !SHR to be avoided!      |<--
// [2b] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]&0xFFFF) = ---------------------------  |
//             DWORD #1                                                                     |
// [2d] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-0]>>16) =                                 |
// [2c] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-0]&0xFFFF) = ------------------------------
					//if ( ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]&0xFFFF) ) & ( (1<<16)-1 )] ) + ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]&0xFFFF)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) ) & ( (1<<16)-1 )] ) + ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-0]&0xFFFF)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]&0xFFFF) ) & ( (1<<16)-1 )] ) < 3 ) Gulliver = cbPattern-(2-1)-2-6; else {
// Since the above Decumanus mumbo-jumbo (3 overlapping lookups vs 2 non-overlapping lookups) is not fast enough we go DuoDecumanus or 3x4:
// [2y][2x][2a][2b][2c][2d]
// DWORD #3
//         DWORD #2
//                 DWORD #1
					//if ( ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) ) & ( (1<<16)-1 )] ) + ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-8]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-8]&0xFFFF) ) & ( (1<<16)-1 )] ) < 2 ) Gulliver = cbPattern-(2-1)-2-8; else {
						if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) {
							// Order 4 [
						// Let's try something "outrageous" like comparing with[out] overlap BBs 4bytes long instead of 1 byte back-to-back:
						// Inhere we are using order 4, 'cbPattern - Order + 1' is the number of BBs for text 'cbPattern' bytes long, for example, for cbPattern=11 'fastest fox' and Order=4 we have BBs = 11-4+1=8:
						//0:"fast" if the comparison failed here, 'count' is 1; 'Gulliver' is cbPattern-(4-1)-7
						//1:"aste" if the comparison failed here, 'count' is 2; 'Gulliver' is cbPattern-(4-1)-6
						//2:"stes" if the comparison failed here, 'count' is 3; 'Gulliver' is cbPattern-(4-1)-5
						//3:"test" if the comparison failed here, 'count' is 4; 'Gulliver' is cbPattern-(4-1)-4
						//4:"est " if the comparison failed here, 'count' is 5; 'Gulliver' is cbPattern-(4-1)-3
						//5:"st f" if the comparison failed here, 'count' is 6; 'Gulliver' is cbPattern-(4-1)-2
						//6:"t fo" if the comparison failed here, 'count' is 7; 'Gulliver' is cbPattern-(4-1)-1
						//7:" fox" if the comparison failed here, 'count' is 8; 'Gulliver' is cbPattern-(4-1)
							count = cbPattern-4+1; 
							// Below comparison is UNIdirectional:
							while ( count > 0 && *(uint32_t *)(pbPattern+count-1) == *(uint32_t *)(&pbTarget[i]+(count-1)) )
								count = count-4;
// count = cbPattern-4+1 = 23-4+1 = 20
// boomshakalakaZZZZZZ[ZZZZ] 20
// boomshakalakaZZ[ZZZZ]ZZZZ 20-4
// boomshakala[kaZZ]ZZZZZZZZ 20-8 = 12
// boomsha[kala]kaZZZZZZZZZZ 20-12 = 8
// boo[msha]kalakaZZZZZZZZZZ 20-16 = 4

// If we miss to hit then no need to compare the original: Needle
if ( count <= 0 ) {
// I have to add out-of-range checks...
// i-(PRIMALposition-1) >= 0
// &pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4
// i-(PRIMALposition-1)+(count-1) >= 0
// &pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4
	if ( (i-(PRIMALposition-1) >= 0) && (&pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4) && (&pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4) ) {
		if ( *(uint32_t *)&pbTarget[i-(PRIMALposition-1)] == *(uint32_t *)(pbPattern-(PRIMALposition-1))) { // This fast check ensures not missing a match (for remainder) when going under 0 in loop below:
			count = PRIMALlengthCANDIDATE-4+1; 
			while ( count > 0 && *(uint32_t *)(pbPattern-(PRIMALposition-1)+count-1) == *(uint32_t *)(&pbTarget[i-(PRIMALposition-1)]+(count-1)) )
				count = count-4;
			if ( count <= 0 ) return(pbTarget+i-(PRIMALposition-1));	
		}
	}
}

							// In order to avoid only-left or only-right WCS the memcmp should be done as left-to-right and right-to-left AT THE SAME TIME.
							// Below comparison is BIdirectional. It pays off when needle is 8+++ long:
//							for (count = cbPattern-4+1; count > 0; count = count-4) {
//								if ( *(uint32_t *)(pbPattern+count-1) != *(uint32_t *)(&pbTarget[i]+(count-1)) ) {break;};
//								if ( *(uint32_t *)(pbPattern+(cbPattern-4+1)-count) != *(uint32_t *)(&pbTarget[i]+(cbPattern-4+1)-count) ) {count = (cbPattern-4+1)-count +(1); break;} // +(1) because two lookups are implemented as one, also no danger of 'count' being 0 because of the fast check outwith the 'while': if ( *(uint32_t *)&pbTarget[i] == ulHashPattern)
//							}
//							if ( count <= 0 ) return(pbTarget+i);
								// Checking the order 2 pairs in mismatched DWORD, all the 3:
								//if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+count-1]] == 0 ) Gulliver = count; // 1 or bigger, as it should
								//if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+count-1+1]] == 0 ) Gulliver = count+1; // 1 or bigger, as it should
								//if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+count-1+1+1]] == 0 ) Gulliver = count+1+1; // 1 or bigger, as it should
							//	if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+count-1]] + bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+count-1+1]] + bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+count-1+1+1]] < 3 ) Gulliver = count; // 1 or bigger, as it should, THE MIN(count,count+1,count+1+1)
								// Above compound 'if' guarantees not that Gulliver > 1, an example:
								// Needle:    fastest tax
								// Window: ...fastast tax...
								// After matching ' tax' vs ' tax' and 'fast' vs 'fast' the mismathced DWORD is 'test' vs 'tast':
								// 'tast' when factorized down to order 2 yields: 'ta','as','st' - all the three when summed give 1+1+1=3 i.e. Gulliver remains 1.
								// Roughly speaking, this attempt maybe has its place in worst-case scenarios but not in English text and even not in ACGT data, that's why I commented it in original 'Shockeroo'.
								//if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+count-1]>>16)+(*(uint32_t *)&pbTarget[i+count-1]&0xFFFF) ) & ( (1<<16)-1 )] == 0 ) Gulliver = count; // 1 or bigger, as it should
								// Above line is replaced by next one with better hashing:
//								if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+count-1]>>(16-1))+(*(uint32_t *)&pbTarget[i+count-1]&0xFFFF) ) & ( (1<<16)-1 )] == 0 ) Gulliver = count; // 1 or bigger, as it should
							// Order 4 ]
						}
					}
				} else Gulliver = cbPattern-(2-1)-2; // -2 because we check the 4 rightmost bytes not 2.
				i = i + Gulliver;
				//GlobalI++; // Comment it, it is only for stats.
			}
			return(NULL);

		} // if ( cbPattern<=NeedleThreshold2vs4swampLITE )
		} // if ( cbPattern<=NeedleThreshold2vs4swampLITE )
	} //if ( cbPattern<4 )
}

// Last change: 2014-Apr-14
// If you want to help me to improve it, email me at: sanmayce@sanmayce.com
// Enfun!

Changes to src/simple_codecs.cpp.

221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
        char key[16] = {0};
        uhash_ctx_t ctx = uhash_alloc(key);
        // FIXME: what if malloc fails?
        ::uhash(ctx, in, isize, out);
        uhash_free(ctx);
        memcpy(in + isize, backup, sizeof(backup));
    }
    
    // TODO: uhash doesn't support blocks > 16 MB
    void umac(char * in, size_t isize, char * out)
    {
        // uhash may overwrite data after the input
        // That's where fsbench stores checksum
        // If this function is called during 'decoding',
        // there's a hash stored already that will be destroyed,







|







221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
        char key[16] = {0};
        uhash_ctx_t ctx = uhash_alloc(key);
        // FIXME: what if malloc fails?
        ::uhash(ctx, in, isize, out);
        uhash_free(ctx);
        memcpy(in + isize, backup, sizeof(backup));
    }

    // TODO: uhash doesn't support blocks > 16 MB
    void umac(char * in, size_t isize, char * out)
    {
        // uhash may overwrite data after the input
        // That's where fsbench stores checksum
        // If this function is called during 'decoding',
        // there's a hash stored already that will be destroyed,
647
648
649
650
651
652
653
















654
655
656
657
658
659
660
    MurmurHash3_x86_128(in, isize, 0, out);
}
void murmur_x64_128(char * in, size_t isize, char * out)
{
    MurmurHash3_x64_128(in, isize, 0, out);
}
#endif//FSBENCH_USE_MURMUR
















#ifdef FSBENCH_USE_QUICKLZZIP

extern "C"
{
#include "quicklzzip.h"
}
size_t qlzzip_c(char * in, size_t isize, char * out, size_t, void *)







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
    MurmurHash3_x86_128(in, isize, 0, out);
}
void murmur_x64_128(char * in, size_t isize, char * out)
{
    MurmurHash3_x64_128(in, isize, 0, out);
}
#endif//FSBENCH_USE_MURMUR
#ifdef FSBENCH_USE_NAKAMICHI

extern "C"
{
unsigned int Compress(char* ret, char* src, unsigned int srcSize);
unsigned int Decompress(char* ret, char* src, unsigned int srcSize);
}
size_t nakamichi_c(char * in, size_t isize, char * out, size_t, void *)
{
    return Compress(out, in, isize);
}
size_t nakamichi_d(char * in, size_t isize, char * out, size_t osize, void *)
{
    return Decompress(out, in, isize);
}
#endif//FSBENCH_USE_NAKAMICHI
#ifdef FSBENCH_USE_QUICKLZZIP

extern "C"
{
#include "quicklzzip.h"
}
size_t qlzzip_c(char * in, size_t isize, char * out, size_t, void *)

Changes to src/simple_codecs.hpp.

124
125
126
127
128
129
130




131
132
133
134
135
136
137
size_t mmini_lzl_d(char * in, size_t isize, char * out, size_t osize, void * _);
#endif// FSBENCH_USE_MMINI
#ifdef FSBENCH_USE_MURMUR
void murmur_x86_32(char * in, size_t isize, char * out);
void murmur_x86_128(char * in, size_t isize, char * out);
void murmur_x64_128(char * in, size_t isize, char * out);
#endif//FSBENCH_USE_MURMUR




#ifdef FSBENCH_USE_QUICKLZZIP
size_t qlzzip_c(char * in, size_t isize, char * out, size_t osize, void * _);
#endif//FSBENCH_USE_QUICKLZZIP
#ifdef FSBENCH_USE_RLE64
size_t RLE64_c(char * in, size_t isize, char * out, size_t osize, void * _);
size_t RLE64_d(char * in, size_t isize, char * out, size_t osize, void * _);
size_t RLE32_c(char * in, size_t isize, char * out, size_t osize, void * _);







>
>
>
>







124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
size_t mmini_lzl_d(char * in, size_t isize, char * out, size_t osize, void * _);
#endif// FSBENCH_USE_MMINI
#ifdef FSBENCH_USE_MURMUR
void murmur_x86_32(char * in, size_t isize, char * out);
void murmur_x86_128(char * in, size_t isize, char * out);
void murmur_x64_128(char * in, size_t isize, char * out);
#endif//FSBENCH_USE_MURMUR
#ifdef FSBENCH_USE_NAKAMICHI
size_t nakamichi_c(char * in, size_t isize, char * out, size_t, void *);
size_t nakamichi_d(char * in, size_t isize, char * out, size_t osize, void *);
#endif//FSBENCH_USE_NAKAMICHI
#ifdef FSBENCH_USE_QUICKLZZIP
size_t qlzzip_c(char * in, size_t isize, char * out, size_t osize, void * _);
#endif//FSBENCH_USE_QUICKLZZIP
#ifdef FSBENCH_USE_RLE64
size_t RLE64_c(char * in, size_t isize, char * out, size_t osize, void * _);
size_t RLE64_d(char * in, size_t isize, char * out, size_t osize, void * _);
size_t RLE32_c(char * in, size_t isize, char * out, size_t osize, void * _);