/* Copyright 2013 Google Inc. All Rights Reserved. | |
Distributed under MIT license. | |
See file LICENSE for detail or copy at https://opensource.org/licenses/MIT | |
*/ | |
/* Literal cost model to allow backward reference replacement to be efficient. | |
*/ | |
#include "./literal_cost.h" | |
#include "../common/types.h" | |
#include "./fast_log.h" | |
#include "./port.h" | |
#include "./utf8_util.h" | |
#if defined(__cplusplus) || defined(c_plusplus) | |
extern "C" { | |
#endif | |
static size_t UTF8Position(size_t last, size_t c, size_t clamp) { | |
if (c < 128) { | |
return 0; /* Next one is the 'Byte 1' again. */ | |
} else if (c >= 192) { /* Next one is the 'Byte 2' of utf-8 encoding. */ | |
return BROTLI_MIN(size_t, 1, clamp); | |
} else { | |
/* Let's decide over the last byte if this ends the sequence. */ | |
if (last < 0xe0) { | |
return 0; /* Completed two or three byte coding. */ | |
} else { /* Next one is the 'Byte 3' of utf-8 encoding. */ | |
return BROTLI_MIN(size_t, 2, clamp); | |
} | |
} | |
} | |
static size_t DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask, | |
const uint8_t *data) { | |
size_t counts[3] = { 0 }; | |
size_t max_utf8 = 1; /* should be 2, but 1 compresses better. */ | |
size_t last_c = 0; | |
size_t utf8_pos = 0; | |
size_t i; | |
for (i = 0; i < len; ++i) { | |
size_t c = data[(pos + i) & mask]; | |
utf8_pos = UTF8Position(last_c, c, 2); | |
++counts[utf8_pos]; | |
last_c = c; | |
} | |
if (counts[2] < 500) { | |
max_utf8 = 1; | |
} | |
if (counts[1] + counts[2] < 25) { | |
max_utf8 = 0; | |
} | |
return max_utf8; | |
} | |
static void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask, | |
const uint8_t *data, float *cost) { | |
/* max_utf8 is 0 (normal ascii single byte modeling), | |
1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling). */ | |
const size_t max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data); | |
size_t histogram[3][256] = { { 0 } }; | |
size_t window_half = 495; | |
size_t in_window = BROTLI_MIN(size_t, window_half, len); | |
size_t in_window_utf8[3] = { 0 }; | |
size_t i; | |
{ /* Bootstrap histograms. */ | |
size_t last_c = 0; | |
size_t utf8_pos = 0; | |
for (i = 0; i < in_window; ++i) { | |
size_t c = data[(pos + i) & mask]; | |
++histogram[utf8_pos][c]; | |
++in_window_utf8[utf8_pos]; | |
utf8_pos = UTF8Position(last_c, c, max_utf8); | |
last_c = c; | |
} | |
} | |
/* Compute bit costs with sliding window. */ | |
for (i = 0; i < len; ++i) { | |
if (i >= window_half) { | |
/* Remove a byte in the past. */ | |
size_t c = | |
i < window_half + 1 ? 0 : data[(pos + i - window_half - 1) & mask]; | |
size_t last_c = | |
i < window_half + 2 ? 0 : data[(pos + i - window_half - 2) & mask]; | |
size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8); | |
--histogram[utf8_pos2][data[(pos + i - window_half) & mask]]; | |
--in_window_utf8[utf8_pos2]; | |
} | |
if (i + window_half < len) { | |
/* Add a byte in the future. */ | |
size_t c = data[(pos + i + window_half - 1) & mask]; | |
size_t last_c = data[(pos + i + window_half - 2) & mask]; | |
size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8); | |
++histogram[utf8_pos2][data[(pos + i + window_half) & mask]]; | |
++in_window_utf8[utf8_pos2]; | |
} | |
{ | |
size_t c = i < 1 ? 0 : data[(pos + i - 1) & mask]; | |
size_t last_c = i < 2 ? 0 : data[(pos + i - 2) & mask]; | |
size_t utf8_pos = UTF8Position(last_c, c, max_utf8); | |
size_t masked_pos = (pos + i) & mask; | |
size_t histo = histogram[utf8_pos][data[masked_pos]]; | |
double lit_cost; | |
if (histo == 0) { | |
histo = 1; | |
} | |
lit_cost = FastLog2(in_window_utf8[utf8_pos]) - FastLog2(histo); | |
lit_cost += 0.02905; | |
if (lit_cost < 1.0) { | |
lit_cost *= 0.5; | |
lit_cost += 0.5; | |
} | |
/* Make the first bytes more expensive -- seems to help, not sure why. | |
Perhaps because the entropy source is changing its properties | |
rapidly in the beginning of the file, perhaps because the beginning | |
of the data is a statistical "anomaly". */ | |
if (i < 2000) { | |
lit_cost += 0.7 - ((double)(2000 - i) / 2000.0 * 0.35); | |
} | |
cost[i] = (float)lit_cost; | |
} | |
} | |
} | |
void BrotliEstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask, | |
const uint8_t *data, float *cost) { | |
if (BrotliIsMostlyUTF8(data, pos, mask, len, kMinUTF8Ratio)) { | |
EstimateBitCostsForLiteralsUTF8(pos, len, mask, data, cost); | |
return; | |
} else { | |
size_t histogram[256] = { 0 }; | |
size_t window_half = 2000; | |
size_t in_window = BROTLI_MIN(size_t, window_half, len); | |
/* Bootstrap histogram. */ | |
size_t i; | |
for (i = 0; i < in_window; ++i) { | |
++histogram[data[(pos + i) & mask]]; | |
} | |
/* Compute bit costs with sliding window. */ | |
for (i = 0; i < len; ++i) { | |
size_t histo; | |
if (i >= window_half) { | |
/* Remove a byte in the past. */ | |
--histogram[data[(pos + i - window_half) & mask]]; | |
--in_window; | |
} | |
if (i + window_half < len) { | |
/* Add a byte in the future. */ | |
++histogram[data[(pos + i + window_half) & mask]]; | |
++in_window; | |
} | |
histo = histogram[data[(pos + i) & mask]]; | |
if (histo == 0) { | |
histo = 1; | |
} | |
{ | |
double lit_cost = FastLog2(in_window) - FastLog2(histo); | |
lit_cost += 0.029; | |
if (lit_cost < 1.0) { | |
lit_cost *= 0.5; | |
lit_cost += 0.5; | |
} | |
cost[i] = (float)lit_cost; | |
} | |
} | |
} | |
} | |
#if defined(__cplusplus) || defined(c_plusplus) | |
} /* extern "C" */ | |
#endif |