/* uniq -- remove duplicate lines from a sorted file
   (This is the uniq utility.)
   Copyright (C) 1986-2018 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.
   (The GNUv3 license.)  */
/* Written by Richard M. Stallman and David MacKenzie. */
#include <config.h>     /* Provides system specific information.  */
#include <getopt.h>
#include <sys/types.h>  /* Provides system data types.  */

#include "system.h"
#include "argmatch.h"
#include "linebuffer.h"
#include "die.h"
#include "error.h"
#include "fadvise.h"
#include "hard-locale.h"
#include "posixver.h"
#include "stdio--.h"
#include "xmemcoll.h"
#include "xstrtol.h"
#include "memcasecmp.h"
#include "quote.h"
/* The official name of this program (e.g., no 'g' prefix).  */
#define PROGRAM_NAME "uniq"

/* Comma-separated author list; main passes it, together with
   PROGRAM_NAME, to case_GETOPT_VERSION_CHAR.  */
#define AUTHORS \
  proper_name ("Richard M. Stallman"), \
  proper_name ("David MacKenzie")
/* Exchange the two 'struct linebuffer *' lvalues A and B.
   Wrapped in do { } while (0) so that it behaves as a single statement.
   A and B are each evaluated twice, so they must be side-effect free.  */
#define SWAP_LINES(A, B)			\
  do						\
    {						\
      struct linebuffer *_tmp;			\
      _tmp = (A);				\
      (A) = (B);				\
      (B) = _tmp;				\
    }						\
  while (0)
/* True if the LC_COLLATE locale is hard.  Set in main from
   hard_locale (LC_COLLATE); selects xmemcoll over memcmp in different.  */
static bool hard_LC_COLLATE;

/* Number of fields to skip on each line when doing comparisons.  */
static size_t skip_fields;

/* Number of chars to skip after skipping any fields.  */
static size_t skip_chars;

/* Number of chars to compare.  main sets this to SIZE_MAX unless
   -w/--check-chars is given.  */
static size_t check_chars;
/* How output lines are prefixed with an occurrence count.
   Note that count_occurrences is the first enumerator and therefore has
   the value 0: always compare the 'countmode' variable against it rather
   than testing the constant itself for truth.  */
enum countmode
{
  count_occurrences,		/* -c Print count before output lines. */
  count_none			/* Default.  Do not print counts. */
};

/* Whether and how to precede the output lines with a count of the number of
   times they occurred in the input. */
static enum countmode countmode;
/* Which lines to output: unique lines, the first of a group of
   repeated lines, and the second and subsequent of a group of
   repeated lines.  */
static bool output_unique;
static bool output_first_repeated;
static bool output_later_repeated;

/* If true, ignore case when comparing.  */
static bool ignore_case;
/* How groups of duplicate lines are delimited for -D/--all-repeated.  */
enum delimit_method
{
  /* No delimiters output.  --all-repeated[=none] */
  DM_NONE,

  /* Delimiter precedes all groups.  --all-repeated=prepend */
  DM_PREPEND,

  /* Delimit all groups.  --all-repeated=separate */
  DM_SEPARATE
};

/* Accepted METHOD spellings for --all-repeated, NULL-terminated.
   Entry I here corresponds to entry I of delimit_method_map, so the
   two tables must be kept in the same order.  */
static char const *const delimit_method_string[] =
{
  "none", "prepend", "separate", NULL
};

/* Enumerators matching delimit_method_string, index for index.  */
static enum delimit_method const delimit_method_map[] =
{
  DM_NONE, DM_PREPEND, DM_SEPARATE
};

/* Select whether/how to delimit groups of duplicate lines.  */
static enum delimit_method delimit_groups;
/* Where group delimiters are placed for --group.  */
enum grouping_method
{
  /* No grouping, when "--group" isn't used */
  GM_NONE,

  /* Delimiter precedes all groups.  --group=prepend */
  GM_PREPEND,

  /* Delimiter follows all groups.  --group=append */
  GM_APPEND,

  /* Delimiter between groups.    --group[=separate] */
  GM_SEPARATE,

  /* Delimiter before and after each group. --group=both */
  GM_BOTH
};

/* Accepted METHOD spellings for --group, NULL-terminated.  GM_NONE has
   no spelling, so entry I here corresponds to entry I of
   grouping_method_map rather than to the enumerator with value I.  */
static char const *const grouping_method_string[] =
{
  "prepend", "append", "separate", "both", NULL
};

/* Enumerators matching grouping_method_string, index for index.  */
static enum grouping_method const grouping_method_map[] =
{
  GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
};

/* The grouping mode in effect; GM_NONE unless --group was given.  */
static enum grouping_method grouping = GM_NONE;
enum Line 142
{
GROUP_OPTION = CHAR_MAX + 1 Line 144
}; Block 9
static struct option const longopts[] = Line 147
{
{"count", no_argument, NULL, 'c'}, Line 149
{"repeated", no_argument, NULL, 'd'}, Line 150
{"all-repeated", optional_argument, NULL, 'D'}, Line 151
{"group", optional_argument, NULL, GROUP_OPTION}, Line 152
{"ignore-case", no_argument, NULL, 'i'}, Line 153
{"unique", no_argument, NULL, 'u'}, Line 154
{"skip-fields", required_argument, NULL, 'f'}, Line 155
{"skip-chars", required_argument, NULL, 's'}, Line 156
{"check-chars", required_argument, NULL, 'w'}, Line 157
{"zero-terminated", no_argument, NULL, 'z'}, Line 158
{GETOPT_HELP_OPTION_DECL}, Line 159
{GETOPT_VERSION_OPTION_DECL}, Line 160
{NULL, 0, NULL, 0} Line 161
}; Block 10
void Line 164
usage (int status) Line 165
{
if (status != EXIT_SUCCESS) Line 167
emit_try_help (); ...!common auto-comment...
else Line 169
{
printf (_("\ Line 171
Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\ Line 172
"), Line 173
program_name); Line 174
fputs (_("\ Line 175
Filter adjacent matching lines from INPUT (or standard input),\n\ Line 176
writing to OUTPUT (or standard output).\n\ Line 177
\n\
With no options, matching lines are merged to the first occurrence.\n\ Line 179
"), stdout); Line 180
emit_mandatory_arg_note (); ...!common auto-comment...
fputs (_("\ Line 184
-c, --count prefix lines by the number of occurrences\n\ Line 185
-d, --repeated only print duplicate lines, one for each group\n\ Line 186
"), stdout); Line 187
fputs (_("\ Line 188
-D print all duplicate lines\n\ Line 189
--all-repeated[=METHOD] like -D, but allow separating groups\n\ Line 190
with an empty line;\n\ Line 191
METHOD={none(default),prepend,separate}\n\ Line 192
"), stdout); Line 193
fputs (_("\ Line 194
-f, --skip-fields=N avoid comparing the first N fields\n\ Line 195
"), stdout); Line 196
fputs (_("\ Line 197
--group[=METHOD] show all items, separating groups with an empty line;\n\Line 198
METHOD={separate(default),prepend,append,both}\n\ Line 199
"), stdout); Line 200
fputs (_("\ Line 201
-i, --ignore-case ignore differences in case when comparing\n\ Line 202
-s, --skip-chars=N avoid comparing the first N characters\n\ Line 203
-u, --unique only print unique lines\n\ Line 204
"), stdout); Line 205
fputs (_("\ Line 206
-z, --zero-terminated line delimiter is NUL, not newline\n\ Line 207
"), stdout); Line 208
fputs (_("\ Line 209
-w, --check-chars=N compare no more than N characters in lines\n\ Line 210
"), stdout); Line 211
fputs (HELP_OPTION_DESCRIPTION, stdout); Line 212
fputs (VERSION_OPTION_DESCRIPTION, stdout); Line 213
fputs (_("\ Line 214
\n\
A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\ Line 216
characters. Fields are skipped before chars.\n\ Line 217
"), stdout); Line 218
fputs (_("\ Line 219
\n\
Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\ Line 221
You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\ Line 222
Also, comparisons honor the rules specified by 'LC_COLLATE'.\n\ Line 223
"), stdout); Line 224
emit_ancillary_info (PROGRAM_NAME); Line 225
}
exit (status); Line 227
} Block 11
/* Return true if the value reported by posix2_version lies in the
   half-open range [200112, 200809).  main consults this before
   accepting an operand of the form "+N" as a skip-chars count.  */
static bool
strict_posix2 (void)
{
  int posix_ver = posix2_version ();
  return 200112 <= posix_ver && posix_ver < 200809;
}
/* Convert OPT to size_t, reporting an error using MSGID if OPT is
invalid. Silently convert too-large values to SIZE_MAX. */
static size_t Line 240
size_opt (char const *opt, char const *msgid) Line 241
{
unsigned long int size; Line 243
verify (SIZE_MAX <= ULONG_MAX); Line 244
switch (xstrtoul (opt, NULL, 10, &size, "")) Line 246
{
case LONGINT_OK: Line 248
case LONGINT_OVERFLOW: Line 249
break; Line 250
default: Line 252
die (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid)); Line 253
}
return MIN (size, SIZE_MAX); Line 256
} Block 13
/* Given a linebuffer LINE,
return a pointer to the beginning of the line's field to be compared. */
static char * _GL_ATTRIBUTE_PURE Line 262
find_field (struct linebuffer const *line) Line 263
{
size_t count; Line 265
char const *lp = line->buffer; Line 266
size_t size = line->length - 1; Line 267
size_t i = 0; Line 268
for (count = 0; count < skip_fields && i < size; count++) Line 270
{
while (i < size && field_sep (lp[i])) Line 272
i++; Line 273
while (i < size && !field_sep (lp[i])) Line 274
i++; Line 275
}
i += MIN (skip_chars, size - i); Line 278
return line->buffer + i; Line 280
} Block 14
/* Return false if two strings OLD and NEW match, true if not.
OLD and NEW point not to the beginnings of the lines
but rather to the beginnings of the fields to compare.
OLDLEN and NEWLEN are their lengths. */
static bool Line 288
different (char *old, char *new, size_t oldlen, size_t newlen) Line 289
{
if (check_chars < oldlen) Line 291
oldlen = check_chars; Line 292
if (check_chars < newlen) Line 293
newlen = check_chars; Line 294
if (ignore_case) Line 296
{
/* FIXME: This should invoke strcoll somehow. */
return oldlen != newlen || memcasecmp (old, new, oldlen); Line 299
}
else if (hard_LC_COLLATE) Line 301
return xmemcoll (old, oldlen, new, newlen) != 0; Line 302
else Line 303
return oldlen != newlen || memcmp (old, new, oldlen); Line 304
} Block 15
/* Output the line in linebuffer LINE to standard output
provided that the switches say it should be output.
MATCH is true if the line matches the previous line.
If requested, print the number of times it occurred, as well;
LINECOUNT + 1 is the number of times that the line occurred. */
static void Line 313
writeline (struct linebuffer const *line, Line 314
bool match, uintmax_t linecount) Line 315
{
if (! (linecount == 0 ? output_unique Line 317
: !match ? output_first_repeated Line 318
: output_later_repeated)) Line 319
return; Line 320
if (countmode == count_occurrences) Line 322
printf ("%7" PRIuMAX " ", linecount + 1); Line 323
fwrite (line->buffer, sizeof (char), line->length, stdout); Line 325...!syscalls auto-comment...
} Block 16
/* Process input file INFILE with output to OUTFILE.
If either is "-", use the standard I/O stream for it instead. */
static void Line 331
check_file (const char *infile, const char *outfile, char delimiter) Line 332
{
struct linebuffer lb1, lb2; Line 334
struct linebuffer *thisline, *prevline; Line 335
if (! (STREQ (infile, "-") || freopen (infile, "r", stdin))) Line 337...!syscalls auto-comment...
die (EXIT_FAILURE, errno, "%s", quotef (infile)); Line 338
if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout))) Line 339...!syscalls auto-comment...
die (EXIT_FAILURE, errno, "%s", quotef (outfile)); Line 340
fadvise (stdin, FADVISE_SEQUENTIAL); Line 342...!syscalls auto-comment...
thisline = &lb1; Line 344
prevline = &lb2; Line 345
initbuffer (thisline); Line 347
initbuffer (prevline); Line 348
/* The duplication in the following 'if' and 'else' blocks is an
optimization to distinguish between when we can print input
lines immediately (1. & 2.) or not.
1. --group => all input lines are printed.
checking for unique/duplicated lines is used only for printing
group separators.
2. The default case in which none of these options has been specified:
--count, --repeated, --all-repeated, --unique
In the default case, this optimization lets uniq output each different
line right away, without waiting to see if the next one is different.
3. All other cases.
*/
if (output_unique && output_first_repeated && countmode == count_none) Line 365
{
char *prevfield IF_LINT ( = NULL); Line 367
size_t prevlen IF_LINT ( = 0); Line 368
bool first_group_printed = false; Line 369
while (!feof (stdin)) Line 371
{
char *thisfield; Line 373
size_t thislen; Line 374
bool new_group; Line 375
if (readlinebuffer_delim (thisline, stdin, delimiter) == 0) Line 377
break; Line 378
thisfield = find_field (thisline); Line 380
thislen = thisline->length - 1 - (thisfield - thisline->buffer); Line 381
new_group = (prevline->length == 0 Line 383
|| different (thisfield, prevfield, thislen, prevlen)); Line 384
if (new_group && grouping != GM_NONE Line 386
&& (grouping == GM_PREPEND || grouping == GM_BOTH Line 387
|| (first_group_printed && (grouping == GM_APPEND Line 388
|| grouping == GM_SEPARATE)))) Line 389
putchar (delimiter); Line 390
if (new_group || grouping != GM_NONE) Line 392
{
fwrite (thisline->buffer, sizeof (char), Line 394...!syscalls auto-comment...
thisline->length, stdout); Line 395
SWAP_LINES (prevline, thisline); Line 397
prevfield = thisfield; Line 398
prevlen = thislen; Line 399
first_group_printed = true; Line 400
}
}
if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)Line 403
putchar (delimiter); Line 404
}
else Line 406
{
char *prevfield; Line 408
size_t prevlen; Line 409
uintmax_t match_count = 0; Line 410
bool first_delimiter = true; Line 411
if (readlinebuffer_delim (prevline, stdin, delimiter) == 0) Line 413
goto closefiles; Line 414
prevfield = find_field (prevline); Line 415
prevlen = prevline->length - 1 - (prevfield - prevline->buffer); Line 416
while (!feof (stdin)) Line 418
{
bool match; Line 420
char *thisfield; Line 421
size_t thislen; Line 422
if (readlinebuffer_delim (thisline, stdin, delimiter) == 0) Line 423
{
if (ferror (stdin)) Line 425
goto closefiles; Line 426
break; Line 427
}
thisfield = find_field (thisline); Line 429
thislen = thisline->length - 1 - (thisfield - thisline->buffer); Line 430
match = !different (thisfield, prevfield, thislen, prevlen); Line 431
match_count += match; Line 432
if (match_count == UINTMAX_MAX) Line 434
{
if (count_occurrences) Line 436
die (EXIT_FAILURE, 0, _("too many repeated lines")); Line 437
match_count--; Line 438
}
if (delimit_groups != DM_NONE) Line 441
{
if (!match) Line 443
{
if (match_count) /* a previous match */ Line 445
first_delimiter = false; /* Only used when DM_SEPARATE */ Line 446
}
else if (match_count == 1) Line 448
{
if ((delimit_groups == DM_PREPEND) Line 450
|| (delimit_groups == DM_SEPARATE Line 451
&& !first_delimiter)) Line 452
putchar (delimiter); Line 453
}
}
if (!match || output_later_repeated) Line 457
{
writeline (prevline, match, match_count); Line 459
SWAP_LINES (prevline, thisline); Line 460
prevfield = thisfield; Line 461
prevlen = thislen; Line 462
if (!match) Line 463
match_count = 0; Line 464
}
}
writeline (prevline, false, match_count); Line 468
}
closefiles: Line 471
if (ferror (stdin) || fclose (stdin) != 0) Line 472...!syscalls auto-comment...
die (EXIT_FAILURE, 0, _("error reading %s"), quoteaf (infile)); Line 473
/* stdout is handled via the atexit-invoked close_stdout function. */
free (lb1.buffer); Line 477
free (lb2.buffer); Line 478
} Block 17
/* Which syntax, if any, was last used to specify the number of fields
   to skip.  main uses this to decide whether a digit option starts a
   new count or extends the one being accumulated.  */
enum Skip_field_option_type
{
  SFO_NONE,		/* No skip-fields option seen yet.  */
  SFO_OBSOLETE,		/* Digit options ("-N").  */
  SFO_NEW		/* "-f N" / "--skip-fields=N".  */
};
int
main (int argc, char **argv) Line 489
{
int optc = 0; Line 491
bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL); Line 492
enum Skip_field_option_type skip_field_option_type = SFO_NONE; Line 493
unsigned int nfiles = 0; Line 494
char const *file[2]; Line 495
char delimiter = '\n'; /* change with --zero-terminated, -z */ Line 496
bool output_option_used = false; /* if true, one of -u/-d/-D/-c was used */ Line 497
file[0] = file[1] = "-"; Line 499
initialize_main (&argc, &argv); VMS-specific entry point handling wildcard expansion
set_program_name (argv[0]); Retains program name and discards path
setlocale (LC_ALL, ""); Sets up internationalization (i18n)
bindtextdomain (PACKAGE, LOCALEDIR); Assigns i18n directorySets text domain for _() [gettext()] function
textdomain (PACKAGE); Sets text domain for _() [gettext()] function
hard_LC_COLLATE = hard_locale (LC_COLLATE); Line 505
atexit (close_stdout); Close stdout on exit (see gnulib)
skip_chars = 0; Line 509
skip_fields = 0; Line 510
check_chars = SIZE_MAX; Line 511
output_unique = output_first_repeated = true; Line 512
output_later_repeated = false; Line 513
countmode = count_none; Line 514
delimit_groups = DM_NONE; Line 515
while (true) Line 517
{
/* Parse an operand with leading "+" as a file after "--" was
seen; or if pedantic and a file was seen; or if not
obsolete. */
if (optc == -1 Line 523
|| (posixly_correct && nfiles != 0) Line 524
|| ((optc = getopt_long (argc, argv, Line 525
"-0123456789Dcdf:is:uw:z", longopts, NULL)) Line 526
== -1)) Line 527
{
if (argc <= optind) Line 529
break; Line 530
if (nfiles == 2) Line 531
{
error (0, 0, _("extra operand %s"), quote (argv[optind])); Line 533
usage (EXIT_FAILURE); Line 534
}
file[nfiles++] = argv[optind++]; Line 536
}
else switch (optc) Line 538
{
case 1: Line 540
{
unsigned long int size; Line 542
if (optarg[0] == '+' Line 543
&& ! strict_posix2 () Line 544
&& xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK Line 545
&& size <= SIZE_MAX) Line 546
skip_chars = size; Line 547
else if (nfiles == 2) Line 548
{
error (0, 0, _("extra operand %s"), quote (optarg)); Line 550
usage (EXIT_FAILURE); Line 551
}
else Line 553
file[nfiles++] = optarg; Line 554
}
break; Line 556
case '0': Line 558
case '1': Line 559
case '2': Line 560
case '3': Line 561
case '4': Line 562
case '5': Line 563
case '6': Line 564
case '7': Line 565
case '8': Line 566
case '9': Line 567
{
if (skip_field_option_type == SFO_NEW) Line 569
skip_fields = 0; Line 570
if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t)) Line 572
skip_fields = SIZE_MAX; Line 573
skip_field_option_type = SFO_OBSOLETE; Line 575
}
break; Line 577
case 'c': Line 579
countmode = count_occurrences; Line 580
output_option_used = true; Line 581
break; Line 582
case 'd': Line 584
output_unique = false; Line 585
output_option_used = true; Line 586
break; Line 587
case 'D': Line 589
output_unique = false; Line 590
output_later_repeated = true; Line 591
if (optarg == NULL) Line 592
delimit_groups = DM_NONE; Line 593
else Line 594
delimit_groups = XARGMATCH ("--all-repeated", optarg, Line 595
delimit_method_string, Line 596
delimit_method_map); Line 597
output_option_used = true; Line 598
break; Line 599
case GROUP_OPTION: Line 601
if (optarg == NULL) Line 602
grouping = GM_SEPARATE; Line 603
else Line 604
grouping = XARGMATCH ("--group", optarg, Line 605
grouping_method_string, Line 606
grouping_method_map); Line 607
break; Line 608
case 'f': Line 610
skip_field_option_type = SFO_NEW; Line 611
skip_fields = size_opt (optarg, Line 612
N_("invalid number of fields to skip")); Line 613
break; Line 614
case 'i': Line 616
ignore_case = true; Line 617
break; Line 618
case 's': Line 620
skip_chars = size_opt (optarg, Line 621
N_("invalid number of bytes to skip")); Line 622
break; Line 623
case 'u': Line 625
output_first_repeated = false; Line 626
output_option_used = true; Line 627
break; Line 628
case 'w': Line 630
check_chars = size_opt (optarg, Line 631
N_("invalid number of bytes to compare")); Line 632
break; Line 633
case 'z': Line 635
delimiter = '\0'; Line 636
break; Line 637
case_GETOPT_HELP_CHAR; Line 639
case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); Line 641
default: Line 643
usage (EXIT_FAILURE); Line 644
}
}
/* Note we could allow --group with -D at least, and that would
avoid the need to specify a grouping method to --all-repeated.
It was thought best to avoid deprecating those parameters though
and keep --group separate to other options. */
if (grouping != GM_NONE && output_option_used) Line 652
{
error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u")); Line 654
usage (EXIT_FAILURE); Line 655
}
if (grouping != GM_NONE && countmode != count_none) Line 658
{
error (0, 0, Line 660
_("grouping and printing repeat counts is meaningless")); Line 661
usage (EXIT_FAILURE); Line 662
}
if (countmode == count_occurrences && output_later_repeated) Line 665
{
error (0, 0, Line 667
_("printing all duplicated lines and repeat counts is meaningless"));Line 668
usage (EXIT_FAILURE); Line 669
}
check_file (file[0], file[1], delimiter); Line 672
return EXIT_SUCCESS; Line 674
} Block 19