Emilio G. Cota | bf3afd5 | 2016-06-08 14:55:26 -0400 | [diff] [blame] | 1 | /* |
| 2 | * qdist.c - QEMU helpers for handling frequency distributions of data. |
| 3 | * |
| 4 | * Copyright (C) 2016, Emilio G. Cota <cota@braap.org> |
| 5 | * |
| 6 | * License: GNU GPL, version 2 or later. |
| 7 | * See the COPYING file in the top-level directory. |
| 8 | */ |
Paolo Bonzini | e9abfcb | 2016-06-06 18:56:37 +0200 | [diff] [blame] | 9 | #include "qemu/osdep.h" |
Emilio G. Cota | bf3afd5 | 2016-06-08 14:55:26 -0400 | [diff] [blame] | 10 | #include "qemu/qdist.h" |
| 11 | |
| 12 | #include <math.h> |
| 13 | #ifndef NAN |
| 14 | #define NAN (0.0 / 0.0) |
| 15 | #endif |
| 16 | |
Emilio G. Cota | 11b7b07 | 2016-07-25 11:03:45 -0400 | [diff] [blame] | 17 | #define QDIST_EMPTY_STR "(empty)" |
| 18 | |
Emilio G. Cota | bf3afd5 | 2016-06-08 14:55:26 -0400 | [diff] [blame] | 19 | void qdist_init(struct qdist *dist) |
| 20 | { |
Emilio G. Cota | 071d405 | 2016-07-25 11:03:44 -0400 | [diff] [blame] | 21 | dist->entries = g_new(struct qdist_entry, 1); |
Emilio G. Cota | bf3afd5 | 2016-06-08 14:55:26 -0400 | [diff] [blame] | 22 | dist->size = 1; |
| 23 | dist->n = 0; |
| 24 | } |
| 25 | |
| 26 | void qdist_destroy(struct qdist *dist) |
| 27 | { |
| 28 | g_free(dist->entries); |
| 29 | } |
| 30 | |
| 31 | static inline int qdist_cmp_double(double a, double b) |
| 32 | { |
| 33 | if (a > b) { |
| 34 | return 1; |
| 35 | } else if (a < b) { |
| 36 | return -1; |
| 37 | } |
| 38 | return 0; |
| 39 | } |
| 40 | |
| 41 | static int qdist_cmp(const void *ap, const void *bp) |
| 42 | { |
| 43 | const struct qdist_entry *a = ap; |
| 44 | const struct qdist_entry *b = bp; |
| 45 | |
| 46 | return qdist_cmp_double(a->x, b->x); |
| 47 | } |
| 48 | |
| 49 | void qdist_add(struct qdist *dist, double x, long count) |
| 50 | { |
| 51 | struct qdist_entry *entry = NULL; |
| 52 | |
| 53 | if (dist->n) { |
| 54 | struct qdist_entry e; |
| 55 | |
| 56 | e.x = x; |
| 57 | entry = bsearch(&e, dist->entries, dist->n, sizeof(e), qdist_cmp); |
| 58 | } |
| 59 | |
| 60 | if (entry) { |
| 61 | entry->count += count; |
| 62 | return; |
| 63 | } |
| 64 | |
| 65 | if (unlikely(dist->n == dist->size)) { |
| 66 | dist->size *= 2; |
Emilio G. Cota | 071d405 | 2016-07-25 11:03:44 -0400 | [diff] [blame] | 67 | dist->entries = g_renew(struct qdist_entry, dist->entries, dist->size); |
Emilio G. Cota | bf3afd5 | 2016-06-08 14:55:26 -0400 | [diff] [blame] | 68 | } |
| 69 | dist->n++; |
| 70 | entry = &dist->entries[dist->n - 1]; |
| 71 | entry->x = x; |
| 72 | entry->count = count; |
| 73 | qsort(dist->entries, dist->n, sizeof(*entry), qdist_cmp); |
| 74 | } |
| 75 | |
| 76 | void qdist_inc(struct qdist *dist, double x) |
| 77 | { |
| 78 | qdist_add(dist, x, 1); |
| 79 | } |
| 80 | |
| 81 | /* |
| 82 | * Unicode for block elements. See: |
| 83 | * https://en.wikipedia.org/wiki/Block_Elements |
| 84 | */ |
| 85 | static const gunichar qdist_blocks[] = { |
| 86 | 0x2581, |
| 87 | 0x2582, |
| 88 | 0x2583, |
| 89 | 0x2584, |
| 90 | 0x2585, |
| 91 | 0x2586, |
| 92 | 0x2587, |
| 93 | 0x2588 |
| 94 | }; |
| 95 | |
| 96 | #define QDIST_NR_BLOCK_CODES ARRAY_SIZE(qdist_blocks) |
| 97 | |
| 98 | /* |
| 99 | * Print a distribution into a string. |
| 100 | * |
| 101 | * This function assumes that appropriate binning has been done on the input; |
| 102 | * see qdist_bin__internal() and qdist_pr_plain(). |
| 103 | * |
| 104 | * Callers must free the returned string with g_free(). |
| 105 | */ |
| 106 | static char *qdist_pr_internal(const struct qdist *dist) |
| 107 | { |
| 108 | double min, max; |
| 109 | GString *s = g_string_new(""); |
| 110 | size_t i; |
| 111 | |
| 112 | /* if only one entry, its printout will be either full or empty */ |
| 113 | if (dist->n == 1) { |
| 114 | if (dist->entries[0].count) { |
| 115 | g_string_append_unichar(s, qdist_blocks[QDIST_NR_BLOCK_CODES - 1]); |
| 116 | } else { |
| 117 | g_string_append_c(s, ' '); |
| 118 | } |
| 119 | goto out; |
| 120 | } |
| 121 | |
| 122 | /* get min and max counts */ |
| 123 | min = dist->entries[0].count; |
| 124 | max = min; |
| 125 | for (i = 0; i < dist->n; i++) { |
| 126 | struct qdist_entry *e = &dist->entries[i]; |
| 127 | |
| 128 | if (e->count < min) { |
| 129 | min = e->count; |
| 130 | } |
| 131 | if (e->count > max) { |
| 132 | max = e->count; |
| 133 | } |
| 134 | } |
| 135 | |
| 136 | for (i = 0; i < dist->n; i++) { |
| 137 | struct qdist_entry *e = &dist->entries[i]; |
| 138 | int index; |
| 139 | |
| 140 | /* make an exception with 0; instead of using block[0], print a space */ |
| 141 | if (e->count) { |
| 142 | /* divide first to avoid loss of precision when e->count == max */ |
| 143 | index = (e->count - min) / (max - min) * (QDIST_NR_BLOCK_CODES - 1); |
| 144 | g_string_append_unichar(s, qdist_blocks[index]); |
| 145 | } else { |
| 146 | g_string_append_c(s, ' '); |
| 147 | } |
| 148 | } |
| 149 | out: |
| 150 | return g_string_free(s, FALSE); |
| 151 | } |
| 152 | |
| 153 | /* |
| 154 | * Bin the distribution in @from into @n bins of consecutive, non-overlapping |
| 155 | * intervals, copying the result to @to. |
| 156 | * |
| 157 | * This function is internal to qdist: only this file and test code should |
| 158 | * ever call it. |
| 159 | * |
| 160 | * Note: calling this function on an already-binned qdist is a bug. |
| 161 | * |
| 162 | * If @n == 0 or @from->n == 1, use @from->n. |
| 163 | */ |
| 164 | void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n) |
| 165 | { |
| 166 | double xmin, xmax; |
| 167 | double step; |
| 168 | size_t i, j; |
| 169 | |
| 170 | qdist_init(to); |
| 171 | |
| 172 | if (from->n == 0) { |
| 173 | return; |
| 174 | } |
| 175 | if (n == 0 || from->n == 1) { |
| 176 | n = from->n; |
| 177 | } |
| 178 | |
| 179 | /* set equally-sized bins between @from's left and right */ |
| 180 | xmin = qdist_xmin(from); |
| 181 | xmax = qdist_xmax(from); |
| 182 | step = (xmax - xmin) / n; |
| 183 | |
| 184 | if (n == from->n) { |
| 185 | /* if @from's entries are equally spaced, no need to re-bin */ |
| 186 | for (i = 0; i < from->n; i++) { |
| 187 | if (from->entries[i].x != xmin + i * step) { |
| 188 | goto rebin; |
| 189 | } |
| 190 | } |
| 191 | /* they're equally spaced, so copy the dist and bail out */ |
Emilio G. Cota | 071d405 | 2016-07-25 11:03:44 -0400 | [diff] [blame] | 192 | to->entries = g_renew(struct qdist_entry, to->entries, n); |
Emilio G. Cota | bf3afd5 | 2016-06-08 14:55:26 -0400 | [diff] [blame] | 193 | to->n = from->n; |
| 194 | memcpy(to->entries, from->entries, sizeof(*to->entries) * to->n); |
| 195 | return; |
| 196 | } |
| 197 | |
| 198 | rebin: |
| 199 | j = 0; |
| 200 | for (i = 0; i < n; i++) { |
| 201 | double x; |
| 202 | double left, right; |
| 203 | |
| 204 | left = xmin + i * step; |
| 205 | right = xmin + (i + 1) * step; |
| 206 | |
| 207 | /* Add x, even if it might not get any counts later */ |
| 208 | x = left; |
| 209 | qdist_add(to, x, 0); |
| 210 | |
| 211 | /* |
| 212 | * To avoid double-counting we capture [left, right) ranges, except for |
| 213 | * the righmost bin, which captures a [left, right] range. |
| 214 | */ |
| 215 | while (j < from->n && (from->entries[j].x < right || i == n - 1)) { |
| 216 | struct qdist_entry *o = &from->entries[j]; |
| 217 | |
| 218 | qdist_add(to, x, o->count); |
| 219 | j++; |
| 220 | } |
| 221 | } |
| 222 | } |
| 223 | |
| 224 | /* |
| 225 | * Print @dist into a string, after re-binning it into @n bins of consecutive, |
| 226 | * non-overlapping intervals. |
| 227 | * |
| 228 | * If @n == 0, use @orig->n. |
| 229 | * |
| 230 | * Callers must free the returned string with g_free(). |
| 231 | */ |
| 232 | char *qdist_pr_plain(const struct qdist *dist, size_t n) |
| 233 | { |
| 234 | struct qdist binned; |
| 235 | char *ret; |
| 236 | |
| 237 | if (dist->n == 0) { |
Emilio G. Cota | 11b7b07 | 2016-07-25 11:03:45 -0400 | [diff] [blame] | 238 | return g_strdup(QDIST_EMPTY_STR); |
Emilio G. Cota | bf3afd5 | 2016-06-08 14:55:26 -0400 | [diff] [blame] | 239 | } |
| 240 | qdist_bin__internal(&binned, dist, n); |
| 241 | ret = qdist_pr_internal(&binned); |
| 242 | qdist_destroy(&binned); |
| 243 | return ret; |
| 244 | } |
| 245 | |
| 246 | static char *qdist_pr_label(const struct qdist *dist, size_t n_bins, |
| 247 | uint32_t opt, bool is_left) |
| 248 | { |
| 249 | const char *percent; |
| 250 | const char *lparen; |
| 251 | const char *rparen; |
| 252 | GString *s; |
| 253 | double x1, x2, step; |
| 254 | double x; |
| 255 | double n; |
| 256 | int dec; |
| 257 | |
| 258 | s = g_string_new(""); |
| 259 | if (!(opt & QDIST_PR_LABELS)) { |
| 260 | goto out; |
| 261 | } |
| 262 | |
| 263 | dec = opt & QDIST_PR_NODECIMAL ? 0 : 1; |
| 264 | percent = opt & QDIST_PR_PERCENT ? "%" : ""; |
| 265 | |
| 266 | n = n_bins ? n_bins : dist->n; |
| 267 | x = is_left ? qdist_xmin(dist) : qdist_xmax(dist); |
| 268 | step = (qdist_xmax(dist) - qdist_xmin(dist)) / n; |
| 269 | |
| 270 | if (opt & QDIST_PR_100X) { |
| 271 | x *= 100.0; |
| 272 | step *= 100.0; |
| 273 | } |
| 274 | if (opt & QDIST_PR_NOBINRANGE) { |
| 275 | lparen = rparen = ""; |
| 276 | x1 = x; |
| 277 | x2 = x; /* unnecessary, but a dumb compiler might not figure it out */ |
| 278 | } else { |
| 279 | lparen = "["; |
| 280 | rparen = is_left ? ")" : "]"; |
| 281 | if (is_left) { |
| 282 | x1 = x; |
| 283 | x2 = x + step; |
| 284 | } else { |
| 285 | x1 = x - step; |
| 286 | x2 = x; |
| 287 | } |
| 288 | } |
| 289 | g_string_append_printf(s, "%s%.*f", lparen, dec, x1); |
| 290 | if (!(opt & QDIST_PR_NOBINRANGE)) { |
| 291 | g_string_append_printf(s, ",%.*f%s", dec, x2, rparen); |
| 292 | } |
| 293 | g_string_append(s, percent); |
| 294 | out: |
| 295 | return g_string_free(s, FALSE); |
| 296 | } |
| 297 | |
| 298 | /* |
| 299 | * Print the distribution's histogram into a string. |
| 300 | * |
| 301 | * See also: qdist_pr_plain(). |
| 302 | * |
| 303 | * Callers must free the returned string with g_free(). |
| 304 | */ |
| 305 | char *qdist_pr(const struct qdist *dist, size_t n_bins, uint32_t opt) |
| 306 | { |
| 307 | const char *border = opt & QDIST_PR_BORDER ? "|" : ""; |
| 308 | char *llabel, *rlabel; |
| 309 | char *hgram; |
| 310 | GString *s; |
| 311 | |
| 312 | if (dist->n == 0) { |
Emilio G. Cota | 11b7b07 | 2016-07-25 11:03:45 -0400 | [diff] [blame] | 313 | return g_strdup(QDIST_EMPTY_STR); |
Emilio G. Cota | bf3afd5 | 2016-06-08 14:55:26 -0400 | [diff] [blame] | 314 | } |
| 315 | |
| 316 | s = g_string_new(""); |
| 317 | |
| 318 | llabel = qdist_pr_label(dist, n_bins, opt, true); |
| 319 | rlabel = qdist_pr_label(dist, n_bins, opt, false); |
| 320 | hgram = qdist_pr_plain(dist, n_bins); |
| 321 | g_string_append_printf(s, "%s%s%s%s%s", |
| 322 | llabel, border, hgram, border, rlabel); |
| 323 | g_free(llabel); |
| 324 | g_free(rlabel); |
| 325 | g_free(hgram); |
| 326 | |
| 327 | return g_string_free(s, FALSE); |
| 328 | } |
| 329 | |
| 330 | static inline double qdist_x(const struct qdist *dist, int index) |
| 331 | { |
| 332 | if (dist->n == 0) { |
| 333 | return NAN; |
| 334 | } |
| 335 | return dist->entries[index].x; |
| 336 | } |
| 337 | |
| 338 | double qdist_xmin(const struct qdist *dist) |
| 339 | { |
| 340 | return qdist_x(dist, 0); |
| 341 | } |
| 342 | |
| 343 | double qdist_xmax(const struct qdist *dist) |
| 344 | { |
| 345 | return qdist_x(dist, dist->n - 1); |
| 346 | } |
| 347 | |
| 348 | size_t qdist_unique_entries(const struct qdist *dist) |
| 349 | { |
| 350 | return dist->n; |
| 351 | } |
| 352 | |
| 353 | unsigned long qdist_sample_count(const struct qdist *dist) |
| 354 | { |
| 355 | unsigned long count = 0; |
| 356 | size_t i; |
| 357 | |
| 358 | for (i = 0; i < dist->n; i++) { |
| 359 | struct qdist_entry *e = &dist->entries[i]; |
| 360 | |
| 361 | count += e->count; |
| 362 | } |
| 363 | return count; |
| 364 | } |
| 365 | |
| 366 | static double qdist_pairwise_avg(const struct qdist *dist, size_t index, |
| 367 | size_t n, unsigned long count) |
| 368 | { |
| 369 | /* amortize the recursion by using a base case > 2 */ |
| 370 | if (n <= 8) { |
| 371 | size_t i; |
| 372 | double ret = 0; |
| 373 | |
| 374 | for (i = 0; i < n; i++) { |
| 375 | struct qdist_entry *e = &dist->entries[index + i]; |
| 376 | |
| 377 | ret += e->x * e->count / count; |
| 378 | } |
| 379 | return ret; |
| 380 | } else { |
| 381 | size_t n2 = n / 2; |
| 382 | |
| 383 | return qdist_pairwise_avg(dist, index, n2, count) + |
| 384 | qdist_pairwise_avg(dist, index + n2, n - n2, count); |
| 385 | } |
| 386 | } |
| 387 | |
| 388 | double qdist_avg(const struct qdist *dist) |
| 389 | { |
| 390 | unsigned long count; |
| 391 | |
| 392 | count = qdist_sample_count(dist); |
| 393 | if (!count) { |
| 394 | return NAN; |
| 395 | } |
| 396 | return qdist_pairwise_avg(dist, 0, dist->n, count); |
| 397 | } |