/* Shuffle lines of text. This is the shuf utility
Copyright (C) 2006-2018 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Written by Paul Eggert. */ The GNUv3 license
#include <config.h> Provides system specific information
#include <sys/types.h> Provides system data types
#include "system.h" ...!includes auto-comment...
#include "die.h" ...!includes auto-comment...
#include "error.h" ...!includes auto-comment...
#include "fadvise.h" ...!includes auto-comment...
#include "getopt.h" ...!includes auto-comment...
#include "linebuffer.h" ...!includes auto-comment...
#include "quote.h" ...!includes auto-comment...
#include "randint.h" ...!includes auto-comment...
#include "randperm.h" ...!includes auto-comment...
#include "read-file.h" ...!includes auto-comment...
#include "stdio--.h" ...!includes auto-comment...
#include "xdectoint.h" ...!includes auto-comment...
#include "xstrtol.h" ...!includes auto-comment...
/* The official name of this program (e.g., no 'g' prefix). */
#define PROGRAM_NAME "shuf" Line 39
#define AUTHORS proper_name ("Paul Eggert") Line 41
/* For reservoir-sampling, allocate the reservoir lines in batches. */
enum { RESERVOIR_LINES_INCREMENT = 1024 }; Line 44
/* reservoir-sampling introduces CPU overhead for small inputs.
So only enable it for inputs >= this limit.
This limit was determined using these commands:
$ for p in $(seq 7); do src/seq $((10**$p)) > 10p$p.in; done
$ for p in $(seq 7); do time shuf-nores -n10 10p$p.in >/dev/null; done
$ for p in $(seq 7); do time shuf -n10 10p$p.in >/dev/null; done .*/
enum { RESERVOIR_MIN_INPUT = 8192 * 1024 }; Line 52
void Line 55
usage (int status) Line 56
{
if (status != EXIT_SUCCESS) Line 58
emit_try_help (); ...!common auto-comment...
else Line 60
{
printf (_("\ Line 62
Usage: %s [OPTION]... [FILE]\n\ Line 63
or: %s -e [OPTION]... [ARG]...\n\ Line 64
or: %s -i LO-HI [OPTION]...\n\ Line 65
"), Line 66
program_name, program_name, program_name); Line 67
fputs (_("\ Line 68
Write a random permutation of the input lines to standard output.\n\ Line 69
"), stdout); Line 70
emit_stdin_note (); ...!common auto-comment...
emit_mandatory_arg_note (); ...!common auto-comment...
fputs (_("\ Line 75
-e, --echo treat each ARG as an input line\n\ Line 76
-i, --input-range=LO-HI treat each number LO through HI as an input line\n\ Line 77
-n, --head-count=COUNT output at most COUNT lines\n\ Line 78
-o, --output=FILE write result to FILE instead of standard output\n\ Line 79
--random-source=FILE get random bytes from FILE\n\ Line 80
-r, --repeat output lines can be repeated\n\ Line 81
"), stdout); Line 82
fputs (_("\ Line 83
-z, --zero-terminated line delimiter is NUL, not newline\n\ Line 84
"), stdout); Line 85
fputs (HELP_OPTION_DESCRIPTION, stdout); Line 86
fputs (VERSION_OPTION_DESCRIPTION, stdout); Line 87
emit_ancillary_info (PROGRAM_NAME); Line 88
}
exit (status); Line 91
} Block 3
/* For long options that have no equivalent short option, use a
non-character as a pseudo short option, starting with CHAR_MAX + 1. */
enum Line 96
{
RANDOM_SOURCE_OPTION = CHAR_MAX + 1 Line 98
}; Block 4
static struct option const long_opts[] = Line 101
{
{"echo", no_argument, NULL, 'e'}, Line 103
{"input-range", required_argument, NULL, 'i'}, Line 104
{"head-count", required_argument, NULL, 'n'}, Line 105
{"output", required_argument, NULL, 'o'}, Line 106
{"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, Line 107
{"repeat", no_argument, NULL, 'r'}, Line 108
{"zero-terminated", no_argument, NULL, 'z'}, Line 109
{GETOPT_HELP_OPTION_DECL}, Line 110
{GETOPT_VERSION_OPTION_DECL}, Line 111
{0, 0, 0, 0}, Line 112
}; Block 5
static void Line 115
input_from_argv (char **operand, int n_operands, char eolbyte) Line 116
{
char *p; Line 118
size_t size = n_operands; Line 119
int i; Line 120
for (i = 0; i < n_operands; i++) Line 122
size += strlen (operand[i]); Line 123
p = xmalloc (size); Line 124
for (i = 0; i < n_operands; i++) Line 126
{
char *p1 = stpcpy (p, operand[i]); Line 128
operand[i] = p; Line 129
p = p1; Line 130
*p++ = eolbyte; Line 131
}
operand[n_operands] = p; Line 134
} Block 6
/* Return the start of the next line after LINE. The current line
ends in EOLBYTE, and is guaranteed to end before LINE + N. */
static char * Line 140
next_line (char *line, char eolbyte, size_t n) Line 141
{
char *p = memchr (line, eolbyte, n); Line 143
return p + 1; Line 144
} Block 7
/* Return the size of the input if possible or OFF_T_MAX if not. */
static off_t Line 149
input_size (void) Line 150
{
off_t file_size; Line 152
struct stat stat_buf; Line 154
if (fstat (STDIN_FILENO, &stat_buf) != 0) Line 155...!syscalls auto-comment......!syscalls auto-comment...
return OFF_T_MAX; Line 156
if (usable_st_size (&stat_buf)) Line 157
file_size = stat_buf.st_size; Line 158
else Line 159
return OFF_T_MAX; Line 160
off_t input_offset = lseek (STDIN_FILENO, 0, SEEK_CUR); Line 162
if (input_offset < 0) Line 163
return OFF_T_MAX; Line 164
file_size -= input_offset; Line 166
return file_size; Line 168
} Block 8
/* Read all lines and store up to K permuted lines in *OUT_RSRV.
Return the number of lines read, up to a maximum of K. */
static size_t Line 174
read_input_reservoir_sampling (FILE *in, char eolbyte, size_t k, Line 175
struct randint_source *s, Line 176
struct linebuffer **out_rsrv) Line 177
{
randint n_lines = 0; Line 179
size_t n_alloc_lines = MIN (k, RESERVOIR_LINES_INCREMENT); Line 180
struct linebuffer *line = NULL; Line 181
struct linebuffer *rsrv; Line 182
rsrv = xcalloc (n_alloc_lines, sizeof (struct linebuffer)); Line 184
/* Fill the first K lines, directly into the reservoir. */
while (n_lines < k Line 187
&& (line = Line 188
readlinebuffer_delim (&rsrv[n_lines], in, eolbyte)) != NULL) Line 189
{
n_lines++; Line 191
/* Enlarge reservoir. */
if (n_lines >= n_alloc_lines) Line 194
{
n_alloc_lines += RESERVOIR_LINES_INCREMENT; Line 196
rsrv = xnrealloc (rsrv, n_alloc_lines, sizeof (struct linebuffer)); Line 197
memset (&rsrv[n_lines], 0, Line 198
RESERVOIR_LINES_INCREMENT * sizeof (struct linebuffer)); Line 199
}
}
/* last line wasn't NULL - so there may be more lines to read. */
if (line != NULL) Line 204
{
struct linebuffer dummy; Line 206
initbuffer (&dummy); /* space for lines not put in reservoir. */ Line 207
/* Choose the fate of the next line, with decreasing probability (as
n_lines increases in size).
If the line will be used, store it directly in the reservoir.
Otherwise, store it in dummy space.
With 'struct linebuffer', storing into existing buffer will reduce
re-allocations (will only re-allocate if the new line is longer than
the currently allocated space). */
do
{
randint j = randint_choose (s, n_lines + 1); /* 0 .. n_lines. */ Line 220
line = (j < k) ? (&rsrv[j]) : (&dummy); Line 221
}
while (readlinebuffer_delim (line, in, eolbyte) != NULL && n_lines++); Line 223
if (! n_lines) Line 225
die (EXIT_FAILURE, EOVERFLOW, _("too many input lines")); Line 226
freebuffer (&dummy); Line 228
}
/* no more input lines, or an input error. */
if (ferror (in)) Line 232
die (EXIT_FAILURE, errno, _("read error")); Line 233
*out_rsrv = rsrv; Line 235
return MIN (k, n_lines); Line 236
} Block 9
static int Line 239
write_permuted_output_reservoir (size_t n_lines, struct linebuffer *lines, Line 240
size_t const *permutation) Line 241
{
for (size_t i = 0; i < n_lines; i++) Line 243
{
const struct linebuffer *p = &lines[permutation[i]]; Line 245
if (fwrite (p->buffer, sizeof (char), p->length, stdout) != p->length) Line 246...!syscalls auto-comment...
return -1; Line 247
}
return 0; Line 250
} Block 10
/* Read data from file IN. Input lines are delimited by EOLBYTE;
silently append a trailing EOLBYTE if the file ends in some other
byte. Store a pointer to the resulting array of lines into *PLINE.
Return the number of lines read. Report an error and exit on
failure. */
static size_t Line 259
read_input (FILE *in, char eolbyte, char ***pline) Line 260
{
char *p; Line 262
char *buf = NULL; Line 263
size_t used; Line 264
char *lim; Line 265
char **line; Line 266
size_t n_lines; Line 267
/* TODO: We should limit the amount of data read here,
to less than RESERVOIR_MIN_INPUT. I.e., adjust fread_file() to support
taking a byte limit. We'd then need to ensure we handle a line spanning
this boundary. With that in place we could set use_reservoir_sampling
when used==RESERVOIR_MIN_INPUT, and have read_input_reservoir_sampling()
call a wrapper function to populate a linebuffer from the internal pline
or if none left, stdin. Doing that would give better performance by
avoiding the reservoir CPU overhead when reading < RESERVOIR_MIN_INPUT
from a pipe, and allow us to dispense with the input_size() function. */
if (!(buf = fread_file (in, &used))) Line 278
die (EXIT_FAILURE, errno, _("read error")); Line 279
if (used && buf[used - 1] != eolbyte) Line 281
buf[used++] = eolbyte; Line 282
lim = buf + used; Line 284
n_lines = 0; Line 286
for (p = buf; p < lim; p = next_line (p, eolbyte, lim - p)) Line 287
n_lines++; Line 288
*pline = line = xnmalloc (n_lines + 1, sizeof *line); Line 290
line[0] = p = buf; Line 292
for (size_t i = 1; i <= n_lines; i++) Line 293
line[i] = p = next_line (p, eolbyte, lim - p); Line 294
return n_lines; Line 296
} Block 11
/* Output N_LINES lines to stdout from LINE array,
chosen by the indices in PERMUTATION.
PERMUTATION and LINE must have at least N_LINES elements.
Strings in LINE must include the line-terminator character. */
static int Line 303
write_permuted_lines (size_t n_lines, char *const *line, Line 304
size_t const *permutation) Line 305
{
for (size_t i = 0; i < n_lines; i++) Line 307
{
char *const *p = line + permutation[i]; Line 309
size_t len = p[1] - p[0]; Line 310
if (fwrite (p[0], sizeof *p[0], len, stdout) != len) Line 311...!syscalls auto-comment...
return -1; Line 312
}
return 0; Line 315
} Block 12
/* Output N_LINES of numbers to stdout, from PERMUTATION array.
PERMUTATION must have at least N_LINES elements. */
static int Line 320
write_permuted_numbers (size_t n_lines, size_t lo_input, Line 321
size_t const *permutation, char eolbyte) Line 322
{
for (size_t i = 0; i < n_lines; i++) Line 324
{
unsigned long int n = lo_input + permutation[i]; Line 326
if (printf ("%lu%c", n, eolbyte) < 0) Line 327
return -1; Line 328
}
return 0; Line 331
} Block 13
/* Output COUNT numbers to stdout, chosen randomly from range
LO_INPUT through HI_INPUT. */
static int Line 336
write_random_numbers (struct randint_source *s, size_t count, Line 337
size_t lo_input, size_t hi_input, char eolbyte) Line 338
{
const randint range = hi_input - lo_input + 1; Line 340
for (size_t i = 0; i < count; i++) Line 342
{
unsigned long int j = lo_input + randint_choose (s, range); Line 344
if (printf ("%lu%c", j, eolbyte) < 0) Line 345
return -1; Line 346
}
return 0; Line 349
} Block 14
/* Output COUNT lines to stdout from LINES array.
LINES must have at least N_LINES elements in it.
Strings in LINES_ must include the line-terminator character. */
static int Line 355
write_random_lines (struct randint_source *s, size_t count, Line 356
char *const *lines, size_t n_lines) Line 357
{
for (size_t i = 0; i < count; i++) Line 359
{
const randint j = randint_choose (s, n_lines); Line 361
char *const *p = lines + j; Line 362
size_t len = p[1] - p[0]; Line 363
if (fwrite (p[0], sizeof *p[0], len, stdout) != len) Line 364...!syscalls auto-comment...
return -1; Line 365
}
return 0; Line 368
} Block 15
int
main (int argc, char **argv) Line 372
{
bool echo = false; Line 374
bool input_range = false; Line 375
size_t lo_input = SIZE_MAX; Line 376
size_t hi_input = 0; Line 377
size_t head_lines = SIZE_MAX; Line 378
char const *outfile = NULL; Line 379
char *random_source = NULL; Line 380
char eolbyte = '\n'; Line 381
char **input_lines = NULL; Line 382
bool use_reservoir_sampling = false; Line 383
bool repeat = false; Line 384
int optc; Line 386
int n_operands; Line 387
char **operand; Line 388
size_t n_lines; Line 389
char **line = NULL; Line 390
struct linebuffer *reservoir = NULL; Line 391
struct randint_source *randint_source; Line 392
size_t *permutation = NULL; Line 393
int i; Line 394
initialize_main (&argc, &argv); VMS-specific entry point handling wildcard expansion
set_program_name (argv[0]); Retains program name and discards path
setlocale (LC_ALL, ""); Sets up internationalization (i18n)
bindtextdomain (PACKAGE, LOCALEDIR); Assigns i18n directorySets text domain for _() [gettext()] function
textdomain (PACKAGE); Sets text domain for _() [gettext()] function
atexit (close_stdout); Close stdout on exit (see gnulib)
while ((optc = getopt_long (argc, argv, "ei:n:o:rz", long_opts, NULL)) != -1) Line 404
switch (optc) Line 405
{
case 'e': Line 407
echo = true; Line 408
break; Line 409
case 'i': Line 411
{
char *p = strchr (optarg, '-'); Line 413
char const *hi_optarg = optarg; Line 414
bool invalid = !p; Line 415
if (input_range) Line 417
die (EXIT_FAILURE, 0, _("multiple -i options specified")); Line 418
input_range = true; Line 419
if (p) Line 421
{
*p = '\0'; Line 423
lo_input = xdectoumax (optarg, 0, SIZE_MAX, "", Line 424
_("invalid input range"), 0); Line 425
*p = '-'; Line 426
hi_optarg = p + 1; Line 427
}
hi_input = xdectoumax (hi_optarg, 0, SIZE_MAX, "", Line 430
_("invalid input range"), 0); Line 431
n_lines = hi_input - lo_input + 1; Line 433
invalid |= ((lo_input <= hi_input) == (n_lines == 0)); Line 434
if (invalid) Line 435
die (EXIT_FAILURE, errno, "%s: %s", _("invalid input range"), Line 436
quote (optarg)); Line 437
}
break; Line 439
case 'n': Line 441
{
unsigned long int argval; Line 443
strtol_error e = xstrtoul (optarg, NULL, 10, &argval, NULL); Line 444
if (e == LONGINT_OK) Line 446
head_lines = MIN (head_lines, argval); Line 447
else if (e != LONGINT_OVERFLOW) Line 448
die (EXIT_FAILURE, 0, _("invalid line count: %s"), Line 449
quote (optarg)); Line 450
}
break; Line 452
case 'o': Line 454
if (outfile && !STREQ (outfile, optarg)) Line 455
die (EXIT_FAILURE, 0, _("multiple output files specified")); Line 456
outfile = optarg; Line 457
break; Line 458
case RANDOM_SOURCE_OPTION: Line 460
if (random_source && !STREQ (random_source, optarg)) Line 461
die (EXIT_FAILURE, 0, _("multiple random sources specified")); Line 462
random_source = optarg; Line 463
break; Line 464
case 'r': Line 466
repeat = true; Line 467
break; Line 468
case 'z': Line 470
eolbyte = '\0'; Line 471
break; Line 472
case_GETOPT_HELP_CHAR; Line 474
case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); Line 475
default: Line 476
usage (EXIT_FAILURE); Line 477
}
n_operands = argc - optind; Line 480
operand = argv + optind; Line 481
/* Check invalid usage. */
if (echo && input_range) Line 484
{
error (0, 0, _("cannot combine -e and -i options")); Line 486
usage (EXIT_FAILURE); Line 487
}
if (input_range ? 0 < n_operands : !echo && 1 < n_operands) Line 489
{
error (0, 0, _("extra operand %s"), quote (operand[!input_range])); Line 491
usage (EXIT_FAILURE); Line 492
}
/* Prepare input. */
if (echo) Line 496
{
input_from_argv (operand, n_operands, eolbyte); Line 498
n_lines = n_operands; Line 499
line = operand; Line 500
}
else if (input_range) Line 502
{
n_lines = hi_input - lo_input + 1; Line 504
line = NULL; Line 505
}
else Line 507
{
/* If an input file is specified, re-open it as stdin. */
if (n_operands == 1) Line 510
if (! (STREQ (operand[0], "-") || ! head_lines Line 511
|| freopen (operand[0], "r", stdin))) Line 512...!syscalls auto-comment...
die (EXIT_FAILURE, errno, "%s", quotef (operand[0])); Line 513
fadvise (stdin, FADVISE_SEQUENTIAL); Line 515...!syscalls auto-comment...
if (! repeat && head_lines != SIZE_MAX Line 517
&& (! head_lines || input_size () > RESERVOIR_MIN_INPUT)) Line 518
{
use_reservoir_sampling = true; Line 520
n_lines = SIZE_MAX; /* unknown number of input lines, for now. */ Line 521
}
else Line 523
{
n_lines = read_input (stdin, eolbyte, &input_lines); Line 525
line = input_lines; Line 526
}
}
if (! repeat) Line 530
head_lines = MIN (head_lines, n_lines); Line 531
randint_source = randint_all_new (random_source, Line 533
(use_reservoir_sampling || repeat Line 534
? SIZE_MAX Line 535
: randperm_bound (head_lines, n_lines))); Line 536
if (! randint_source) Line 537
die (EXIT_FAILURE, errno, "%s", quotef (random_source)); Line 538
if (use_reservoir_sampling) Line 540
{
/* Instead of reading the entire file into 'line',
use reservoir-sampling to store just "head_lines" random lines. */
n_lines = read_input_reservoir_sampling (stdin, eolbyte, head_lines, Line 544
randint_source, &reservoir); Line 545
head_lines = n_lines; Line 546
}
/* Close stdin now, rather than earlier, so that randint_all_new
doesn't have to worry about opening something other than
stdin. */
if (! (echo || input_range) Line 552
&& (fclose (stdin) != 0)) Line 553...!syscalls auto-comment...
die (EXIT_FAILURE, errno, _("read error")); Line 554
if (!repeat) Line 556
permutation = randperm_new (randint_source, head_lines, n_lines); Line 557
if (outfile && ! freopen (outfile, "w", stdout)) Line 559...!syscalls auto-comment...
die (EXIT_FAILURE, errno, "%s", quotef (outfile)); Line 560
/* Generate output according to requested method */
if (repeat) Line 563
{
if (head_lines == 0) Line 565
i = 0; Line 566
else Line 567
{
if (n_lines == 0) Line 569
die (EXIT_FAILURE, 0, _("no lines to repeat")); Line 570
if (input_range) Line 571
i = write_random_numbers (randint_source, head_lines, Line 572
lo_input, hi_input, eolbyte); Line 573
else Line 574
i = write_random_lines (randint_source, head_lines, line, n_lines); Line 575
}
}
else Line 578
{
if (use_reservoir_sampling) Line 580
i = write_permuted_output_reservoir (n_lines, reservoir, permutation); Line 581
else if (input_range) Line 582
i = write_permuted_numbers (head_lines, lo_input, Line 583
permutation, eolbyte); Line 584
else Line 585
i = write_permuted_lines (head_lines, line, permutation); Line 586
}
if (i != 0) Line 589
die (EXIT_FAILURE, errno, _("write error")); Line 590
#ifdef lint Line 592
free (permutation); Line 593
randint_all_free (randint_source); Line 594
if (input_lines) Line 595
{
free (input_lines[0]); Line 597
free (input_lines); Line 598
}
if (reservoir) Line 600
{
size_t j; Line 602
for (j = 0; j < n_lines; ++j) Line 603
freebuffer (&reservoir[j]); Line 604
free (reservoir); Line 605
}
#endif Line 607
return EXIT_SUCCESS; Line 609
} Block 16