/* tr -- a filter to translate characters
Copyright (C) 1991-2018 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* Written by Jim Meyering */
#include <config.h> Provides system specific information
#include <stdio.h> Provides standard I/O capability
#include <assert.h> ...!includes auto-comment...
#include <sys/types.h> Provides system data types
#include <getopt.h> ...!includes auto-comment...
#include "system.h" ...!includes auto-comment...
#include "die.h" ...!includes auto-comment...
#include "error.h" ...!includes auto-comment...
#include "fadvise.h" ...!includes auto-comment...
#include "quote.h" ...!includes auto-comment...
#include "safe-read.h" ...!includes auto-comment...
#include "xbinary-io.h" ...!includes auto-comment...
#include "xstrtol.h" ...!includes auto-comment...
/* The official name of this program (e.g., no 'g' prefix).  */
#define PROGRAM_NAME "tr"

/* Author credit, passed through proper_name ().  */
#define AUTHORS proper_name ("Jim Meyering")

/* Number of distinct values an unsigned char can take.  It sizes the
   per-character tables in_squeeze_set, in_delete_set and xlate.  */
enum { N_CHARS = UCHAR_MAX + 1 };
/* An unsigned integer type big enough to hold a repeat count or an
   unsigned character.  POSIX requires support for repeat counts as
   high as 2**31 - 1.  Since repeat counts might need to expand to
   match the length of an argument string, we need at least size_t to
   avoid arbitrary internal limits.  It doesn't cost much to use
   uintmax_t, though.  */
typedef uintmax_t count;
/* The value for Spec_list->state that indicates to
   get_next that it should initialize the tail pointer.
   Its value should be as large as possible to avoid conflict with
   a valid value for the state field -- and that may be as
   large as any valid repeat_count.  */
#define BEGIN_STATE (UINTMAX_MAX - 1)

/* The value for Spec_list->state that indicates to
   get_next that the element pointed to by Spec_list->tail is
   being considered for the first time on this pass through the
   list -- it indicates that get_next should make any necessary
   initializations.  */
#define NEW_ELEMENT (BEGIN_STATE + 1)

/* The maximum possible repeat count.  Due to how the states are
   implemented, it can be as much as BEGIN_STATE.  */
#define REPEAT_COUNT_MAXIMUM BEGIN_STATE
/* The following (but not CC_NO_CLASS) are indices into the array of
   valid character class strings (char_class_name), so their numeric
   values must match the order of that array.  CC_NO_CLASS is the
   "not found" result of look_up_char_class.  */
enum Char_class
  {
    CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
    CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
    CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
    CC_NO_CLASS = 9999
  };
/* Character class to which a character (returned by get_next) belonged;
   but it is set only if the construct from which the character was obtained
   was one of the character classes [:upper:] or [:lower:].  The value
   is used only when translating and then, only to make sure that upper
   and lower class constructs have the same relative positions in string1
   and string2.  */
enum Upper_Lower_class
  {
    UL_LOWER,   /* Character came from [:lower:].  */
    UL_UPPER,   /* Character came from [:upper:].  */
    UL_NONE     /* Character came from any other construct.  */
  };
/* The type of a List_element.  See build_spec_list for more details.  */
enum Range_element_type
  {
    RE_NORMAL_CHAR,     /* A single literal character.  */
    RE_RANGE,           /* r-s  */
    RE_CHAR_CLASS,      /* [:str:]  */
    RE_EQUIV_CLASS,     /* [=c=]  */
    RE_REPEATED_CHAR    /* [c*n] or [c*]  */
  };
/* One construct in one of tr's argument strings.
For example, consider the POSIX version of the classic tr command:
tr -cs 'a-zA-Z_' '[\n*]'
String1 has 3 constructs, two of which are ranges (a-z and A-Z),
and a single normal character, '_'. String2 has one construct. */
struct List_element Line 106
{
enum Range_element_type type; Line 108
struct List_element *next; Line 109
union Line 110
{
unsigned char normal_char; Line 112
struct /* unnamed */ Line 113
{
unsigned char first_char; Line 115
unsigned char last_char; Line 116
}
range; Line 118
enum Char_class char_class; Line 119
unsigned char equiv_code; Line 120
struct /* unnamed */ Line 121
{
unsigned char the_repeated_char; Line 123
count repeat_count; Line 124
}
repeated_char; Line 126
}
u;
}; Block 5
/* Each of tr's argument strings is parsed into a form that is easier
to work with: a linked list of constructs (struct List_element).
Each Spec_list structure also encapsulates various attributes of
the corresponding argument string. The attributes are used mainly
to verify that the strings are valid in the context of any options
specified (like -s, -d, or -c). The main exception is the member
'tail', which is first used to construct the list. After construction,
it is used by get_next to save its state when traversing the list.
The member 'state' serves a similar function. */
struct Spec_list Line 140
{
/* Points to the head of the list of range elements.
The first struct is a dummy; its members are never used. */
struct List_element *head; Line 144
/* When appending, points to the last element. When traversing via
get_next(), points to the element to process next. Setting
Spec_list.state to the value BEGIN_STATE before calling get_next
signals get_next to initialize tail to point to head->next. */
struct List_element *tail; Line 150
/* Used to save state between calls to get_next. */
count state; Line 153
/* Length, in the sense that length ('a-z[:digit:]123abc')
is 42 ( = 26 + 10 + 6). */
count length; Line 157
/* The number of [c*] and [c*0] constructs that appear in this spec. */
size_t n_indefinite_repeats; Line 160
/* If n_indefinite_repeats is nonzero, this points to the List_element
corresponding to the last [c*] or [c*0] construct encountered in
this spec. Otherwise it is undefined. */
struct List_element *indefinite_repeat_element; Line 165
/* True if this spec contains at least one equivalence
class construct e.g. [=c=]. */
bool has_equiv_class; Line 169
/* True if this spec contains at least one character class
construct. E.g. [:digit:]. */
bool has_char_class; Line 173
/* True if this spec contains at least one of the character class
constructs (all but upper and lower) that aren't allowed in s2. */
bool has_restricted_char_class; Line 177
};
/* A representation for an unquoted string1 or string2.  S holds LEN
   bytes and is not NUL-terminated (it may contain zero bytes).
   ESCAPED is a parallel vector of LEN flags: unquote sets ESCAPED[i]
   when byte S[i] was produced from a backslash sequence, so that the
   parser can tell a literal '[' , '-' , '*' ... from an escaped one.
   A lone backslash at the end of the argument is stored unescaped.  */
struct E_string
{
  char *s;          /* The unquoted bytes.  */
  bool *escaped;    /* escaped[i] is true if s[i] came from an escape.  */
  size_t len;       /* Number of valid entries in S and ESCAPED.  */
};
/* Return nonzero if the Ith character of escaped string ES matches C
and is not escaped itself. */
static inline bool Line 193
es_match (struct E_string const *es, size_t i, char c) Line 194
{
return es->s[i] == c && !es->escaped[i]; Line 196
} Block 8
/* When true, each sequence in the input of a repeated character
   (call it c) is replaced (in the output) by a single occurrence of c
   for every c in the squeeze set.  */
static bool squeeze_repeats = false;

/* When true, removes characters in the delete set from input.  */
static bool delete = false;

/* Use the complement of set1 in place of set1.  */
static bool complement = false;

/* When tr is performing translation and string1 is longer than string2,
   POSIX says that the result is unspecified.  That gives the implementor
   of a POSIX conforming version of tr two reasonable choices for the
   semantics of this case.

   * The BSD tr pads string2 to the length of string1 by
   repeating the last character in string2.

   * System V tr ignores characters in string1 that have no
   corresponding character in string2.  That is, string1 is effectively
   truncated to the length of string2.

   When true, this flag causes GNU tr to imitate the behavior
   of System V tr when translating with string1 longer than string2.
   The default is to emulate BSD tr.  This flag is ignored in modes where
   no translation is performed.  Emulating the System V tr
   in this exceptional case causes the relatively common BSD idiom:

       tr -cs A-Za-z0-9 '\012'

   to break (it would convert only zero bytes, rather than all
   non-alphanumerics, to newlines).

   WARNING: This switch does not provide general BSD or System V
   compatibility.  For example, it doesn't disable the interpretation
   of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
   some unfortunate coincidence you use such constructs in scripts
   expecting to use some other version of tr, the scripts will break.  */
static bool truncate_set1 = false;

/* An alias for (!delete && non_option_args == 2).
   It is set in main and used there and in validate().  */
static bool translating;

/* A BUFSIZ-byte scratch buffer.  NOTE(review): presumably the buffer
   input is read into; its users are outside this part of the file.  */
static char io_buf[BUFSIZ];
/* Names of the supported character classes, as written between
   "[:" and ":]".  The order matters: the index of each name is the
   value of the corresponding enum Char_class constant.  */
static char const *const char_class_name[] =
{
  "alnum", "alpha", "blank", "cntrl", "digit", "graph",
  "lower", "print", "punct", "space", "upper", "xdigit"
};
/* Array of boolean values.  A character 'c' is a member of the
   squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
   set is defined by the last (possibly, the only) string argument
   on the command line when the squeeze option is given.  */
static bool in_squeeze_set[N_CHARS];

/* Array of boolean values.  A character 'c' is a member of the
   delete set if and only if in_delete_set[c] is true.  The delete
   set is defined by the first (or only) string argument on the
   command line when the delete option is given.  */
static bool in_delete_set[N_CHARS];

/* Array of character values defining the translation (if any) that
   tr is to perform.  Translation is performed only when there are
   two specification strings and the delete switch is not given.  */
static char xlate[N_CHARS];
static struct option const long_options[] = Line 269
{
{"complement", no_argument, NULL, 'c'}, Line 271
{"delete", no_argument, NULL, 'd'}, Line 272
{"squeeze-repeats", no_argument, NULL, 's'}, Line 273
{"truncate-set1", no_argument, NULL, 't'}, Line 274
{GETOPT_HELP_OPTION_DECL}, Line 275
{GETOPT_VERSION_OPTION_DECL}, Line 276
{NULL, 0, NULL, 0} Line 277
}; Block 10
void Line 280
usage (int status) Line 281
{
if (status != EXIT_SUCCESS) Line 283
emit_try_help (); ...!common auto-comment...
else Line 285
{
printf (_("\ Line 287
Usage: %s [OPTION]... SET1 [SET2]\n\ Line 288
"), Line 289
program_name); Line 290
fputs (_("\ Line 291
Translate, squeeze, and/or delete characters from standard input,\n\ Line 292
writing to standard output.\n\ Line 293
\n\
-c, -C, --complement use the complement of SET1\n\ Line 295
-d, --delete delete characters in SET1, do not translate\n\ Line 296
-s, --squeeze-repeats replace each sequence of a repeated character\n\ Line 297
that is listed in the last specified SET,\n\ Line 298
with a single occurrence of that character\n\ Line 299
-t, --truncate-set1 first truncate SET1 to length of SET2\n\ Line 300
"), stdout); Line 301
fputs (HELP_OPTION_DESCRIPTION, stdout); Line 302
fputs (VERSION_OPTION_DESCRIPTION, stdout); Line 303
fputs (_("\ Line 304
\n\
SETs are specified as strings of characters. Most represent themselves.\n\ Line 306
Interpreted sequences are:\n\ Line 307
\n\
\\NNN character with octal value NNN (1 to 3 octal digits)\n\ Line 309
\\\\ backslash\n\ Line 310
\\a audible BEL\n\ Line 311
\\b backspace\n\ Line 312
\\f form feed\n\ Line 313
\\n new line\n\ Line 314
\\r return\n\ Line 315
\\t horizontal tab\n\ Line 316
"), stdout); Line 317
fputs (_("\ Line 318
\\v vertical tab\n\ Line 319
CHAR1-CHAR2 all characters from CHAR1 to CHAR2 in ascending order\n\ Line 320
[CHAR*] in SET2, copies of CHAR until length of SET1\n\ Line 321
[CHAR*REPEAT] REPEAT copies of CHAR, REPEAT octal if starting with 0\n\ Line 322
[:alnum:] all letters and digits\n\ Line 323
[:alpha:] all letters\n\ Line 324
[:blank:] all horizontal whitespace\n\ Line 325
[:cntrl:] all control characters\n\ Line 326
[:digit:] all digits\n\ Line 327
"), stdout); Line 328
fputs (_("\ Line 329
[:graph:] all printable characters, not including space\n\ Line 330
[:lower:] all lower case letters\n\ Line 331
[:print:] all printable characters, including space\n\ Line 332
[:punct:] all punctuation characters\n\ Line 333
[:space:] all horizontal or vertical whitespace\n\ Line 334
[:upper:] all upper case letters\n\ Line 335
[:xdigit:] all hexadecimal digits\n\ Line 336
[=CHAR=] all characters which are equivalent to CHAR\n\ Line 337
"), stdout); Line 338
fputs (_("\ Line 339
\n\
Translation occurs if -d is not given and both SET1 and SET2 appear.\n\ Line 341
-t may be used only when translating. SET2 is extended to length of\n\ Line 342
SET1 by repeating its last character as necessary. Excess characters\n\ Line 343
of SET2 are ignored. Only [:lower:] and [:upper:] are guaranteed to\n\ Line 344
expand in ascending order; used in SET2 while translating, they may\n\ Line 345
only be used in pairs to specify case conversion. -s uses the last\n\ Line 346
specified SET, and occurs after translation or deletion.\n\ Line 347
"), stdout); Line 348
emit_ancillary_info (PROGRAM_NAME); Line 349
}
exit (status); Line 351
} Block 11
/* Return true if the character C is a member of the
   equivalence class containing the character EQUIV_CLASS.
   As implemented, every character is treated as the only member of
   its own equivalence class, so this is a plain equality test.  */
static inline bool
is_equiv_class_member (unsigned char equiv_class, unsigned char c)
{
  return equiv_class == c;
}
/* Return true if the character C is a member of the
character class CHAR_CLASS. */
static bool _GL_ATTRIBUTE_PURE Line 366
is_char_class_member (enum Char_class char_class, unsigned char c) Line 367
{
int result; Line 369
switch (char_class) Line 371
{
case CC_ALNUM: Line 373
result = isalnum (c); Line 374
break; Line 375
case CC_ALPHA: Line 376
result = isalpha (c); Line 377
break; Line 378
case CC_BLANK: Line 379
result = isblank (c); Line 380
break; Line 381
case CC_CNTRL: Line 382
result = iscntrl (c); Line 383
break; Line 384
case CC_DIGIT: Line 385
result = isdigit (c); Line 386
break; Line 387
case CC_GRAPH: Line 388
result = isgraph (c); Line 389
break; Line 390
case CC_LOWER: Line 391
result = islower (c); Line 392
break; Line 393
case CC_PRINT: Line 394
result = isprint (c); Line 395
break; Line 396
case CC_PUNCT: Line 397
result = ispunct (c); Line 398
break; Line 399
case CC_SPACE: Line 400
result = isspace (c); Line 401
break; Line 402
case CC_UPPER: Line 403
result = isupper (c); Line 404
break; Line 405
case CC_XDIGIT: Line 406
result = isxdigit (c); Line 407
break; Line 408
default: Line 409
abort (); ...!common auto-comment...
}
return !! result; Line 413
} Block 13
static void Line 416
es_free (struct E_string *es) Line 417
{
free (es->s); Line 419
free (es->escaped); Line 420
} Block 14
/* Perform the first pass over each range-spec argument S, converting all
\c and \ddd escapes to their one-byte representations. If an invalid
quote sequence is found print an error message and return false;
Otherwise set *ES to the resulting string and return true.
The resulting array of characters may contain zero-bytes;
however, on input, S is assumed to be null-terminated, and hence
cannot contain actual (non-escaped) zero bytes. */
static bool Line 431
unquote (char const *s, struct E_string *es) Line 432
{
size_t len = strlen (s); Line 434
es->s = xmalloc (len); Line 436
es->escaped = xcalloc (len, sizeof es->escaped[0]); Line 437
unsigned int j = 0; Line 439
for (unsigned int i = 0; s[i]; i++) Line 440
{
unsigned char c; Line 442
int oct_digit; Line 443
switch (s[i]) Line 445
{
case '\\': Line 447
es->escaped[j] = true; Line 448
switch (s[i + 1]) Line 449
{
case '\\': Line 451
c = '\\'; Line 452
break; Line 453
case 'a': Line 454
c = '\a'; Line 455
break; Line 456
case 'b': Line 457
c = '\b'; Line 458
break; Line 459
case 'f': Line 460
c = '\f'; Line 461
break; Line 462
case 'n': Line 463
c = '\n'; Line 464
break; Line 465
case 'r': Line 466
c = '\r'; Line 467
break; Line 468
case 't': Line 469
c = '\t'; Line 470
break; Line 471
case 'v': Line 472
c = '\v'; Line 473
break; Line 474
case '0': Line 475
case '1': Line 476
case '2': Line 477
case '3': Line 478
case '4': Line 479
case '5': Line 480
case '6': Line 481
case '7': Line 482
c = s[i + 1] - '0'; Line 483
oct_digit = s[i + 2] - '0'; Line 484
if (0 <= oct_digit && oct_digit <= 7) Line 485
{
c = 8 * c + oct_digit; Line 487
++i; Line 488
oct_digit = s[i + 2] - '0'; Line 489
if (0 <= oct_digit && oct_digit <= 7) Line 490
{
if (8 * c + oct_digit < N_CHARS) Line 492
{
c = 8 * c + oct_digit; Line 494
++i; Line 495
}
else Line 497
{
/* A 3-digit octal number larger than \377 won't
fit in 8 bits. So we stop when adding the
next digit would put us over the limit and
give a warning about the ambiguity. POSIX
isn't clear on this, and we interpret this
lack of clarity as meaning the resulting behavior
is undefined, which means we're allowed to issue
a warning. */
error (0, 0, _("warning: the ambiguous octal escape\ Line 507
\\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, %c"), Line 508
s[i], s[i + 1], s[i + 2], Line 509
s[i], s[i + 1], s[i + 2]); Line 510
}
}
}
break; Line 514
case '\0': Line 515
error (0, 0, _("warning: an unescaped backslash " Line 516
"at end of string is not portable")); Line 517
/* POSIX is not clear about this. */
es->escaped[j] = false; Line 519
i--; Line 520
c = '\\'; Line 521
break; Line 522
default: Line 523
c = s[i + 1]; Line 524
break; Line 525
}
++i; Line 527
es->s[j++] = c; Line 528
break; Line 529
default: Line 530
es->s[j++] = s[i]; Line 531
break; Line 532
}
}
es->len = j; Line 535
return true; Line 536
} Block 15
/* If CLASS_STR is a valid character class string, return its index
in the global char_class_name array. Otherwise, return CC_NO_CLASS. */
static enum Char_class _GL_ATTRIBUTE_PURE Line 542
look_up_char_class (char const *class_str, size_t len) Line 543
{
enum Char_class i; Line 545
for (i = 0; i < ARRAY_CARDINALITY (char_class_name); i++) Line 547
if (STREQ_LEN (class_str, char_class_name[i], len) Line 548
&& strlen (char_class_name[i]) == len) Line 549
return i; Line 550
return CC_NO_CLASS; Line 551
} Block 16
/* Return a newly allocated string with a printable version of C:
   C itself if it is printable (isprint), otherwise a backslash
   followed by three octal digits.  The caller must free the result.
   This function is used solely for formatting error messages.  */
static char *
make_printable_char (unsigned char c)
{
  /* Room for the longest form, "\ooo", plus the terminating NUL.  */
  enum { PRINTABLE_CHAR_BUFSIZE = 5 };
  char *buf = xmalloc (PRINTABLE_CHAR_BUFSIZE);

  if (isprint (c))
    {
      buf[0] = c;
      buf[1] = '\0';
    }
  else
    {
      /* Bounded write; %o expects an unsigned int argument.  */
      snprintf (buf, PRINTABLE_CHAR_BUFSIZE, "\\%03o", (unsigned int) c);
    }
  return buf;
}
/* Return a newly allocated copy of S which is suitable for printing.
   LEN is the number of characters in S.  Most non-printing
   (isprint) characters are represented by a backslash followed by
   3 octal digits.  However, the characters represented by \c escapes
   where c is one of [abfnrtv] are represented by their 2-character \c
   sequences.  A backslash is copied through as a single backslash.
   The caller must free the result.
   This function is used solely for printing error messages.  */
static char *
make_printable_str (char const *s, size_t len)
{
  /* Worst case is that every character expands to a backslash
     followed by a 3-character octal escape sequence.  */
  char *printable_buf = xnmalloc (len + 1, 4);
  char *p = printable_buf;

  for (size_t i = 0; i < len; i++)
    {
      char buf[5];
      char const *tmp = NULL;
      unsigned char c = s[i];

      switch (c)
        {
        case '\\':
          tmp = "\\";
          break;
        case '\a':
          tmp = "\\a";
          break;
        case '\b':
          tmp = "\\b";
          break;
        case '\f':
          tmp = "\\f";
          break;
        case '\n':
          tmp = "\\n";
          break;
        case '\r':
          tmp = "\\r";
          break;
        case '\t':
          tmp = "\\t";
          break;
        case '\v':
          tmp = "\\v";
          break;
        default:
          if (isprint (c))
            {
              buf[0] = c;
              buf[1] = '\0';
            }
          else
            snprintf (buf, sizeof buf, "\\%03o", (unsigned int) c);
          tmp = buf;
          break;
        }
      /* stpcpy returns the address of the NUL it wrote, which is
         where the next piece must start.  */
      p = stpcpy (p, tmp);
    }
  /* Needed when LEN is 0, since the loop then writes nothing;
     otherwise it merely rewrites the NUL stpcpy left at *P.  */
  *p = '\0';
  return printable_buf;
}
/* Append a newly allocated structure representing a
character C to the specification list LIST. */
static void Line 640
append_normal_char (struct Spec_list *list, unsigned char c) Line 641
{
struct List_element *new = xmalloc (sizeof *new); Line 643
new->next = NULL; Line 644
new->type = RE_NORMAL_CHAR; Line 645
new->u.normal_char = c; Line 646
assert (list->tail); Line 647
list->tail->next = new; Line 648
list->tail = new; Line 649
} Block 19
/* Append a newly allocated structure representing the range
of characters from FIRST to LAST to the specification list LIST.
Return false if LAST precedes FIRST in the collating sequence,
true otherwise. This means that '[c-c]' is acceptable. */
static bool Line 657
append_range (struct Spec_list *list, unsigned char first, unsigned char last) Line 658
{
if (last < first) Line 660
{
char *tmp1 = make_printable_char (first); Line 662
char *tmp2 = make_printable_char (last); Line 663
error (0, 0, Line 665
_("range-endpoints of '%s-%s' are in reverse collating sequence order"), Line 666
tmp1, tmp2); Line 667
free (tmp1); Line 668
free (tmp2); Line 669
return false; Line 670
}
struct List_element *new = xmalloc (sizeof *new); Line 672
new->next = NULL; Line 673
new->type = RE_RANGE; Line 674
new->u.range.first_char = first; Line 675
new->u.range.last_char = last; Line 676
assert (list->tail); Line 677
list->tail->next = new; Line 678
list->tail = new; Line 679
return true; Line 680
} Block 20
/* If CHAR_CLASS_STR is a valid character class string, append a
newly allocated structure representing that character class to the end
of the specification list LIST and return true. If CHAR_CLASS_STR is not
a valid string return false. */
static bool Line 688
append_char_class (struct Spec_list *list, Line 689
char const *char_class_str, size_t len) Line 690
{
enum Char_class char_class = look_up_char_class (char_class_str, len); Line 692
if (char_class == CC_NO_CLASS) Line 693
return false; Line 694
struct List_element *new = xmalloc (sizeof *new); Line 695
new->next = NULL; Line 696
new->type = RE_CHAR_CLASS; Line 697
new->u.char_class = char_class; Line 698
assert (list->tail); Line 699
list->tail->next = new; Line 700
list->tail = new; Line 701
return true; Line 702
} Block 21
/* Append a newly allocated structure representing a [c*n]
repeated character construct to the specification list LIST.
THE_CHAR is the single character to be repeated, and REPEAT_COUNT
is a non-negative repeat count. */
static void Line 710
append_repeated_char (struct Spec_list *list, unsigned char the_char, Line 711
count repeat_count) Line 712
{
struct List_element *new = xmalloc (sizeof *new); Line 714
new->next = NULL; Line 715
new->type = RE_REPEATED_CHAR; Line 716
new->u.repeated_char.the_repeated_char = the_char; Line 717
new->u.repeated_char.repeat_count = repeat_count; Line 718
assert (list->tail); Line 719
list->tail->next = new; Line 720
list->tail = new; Line 721
} Block 22
/* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
the length of that string, LEN, if LEN is exactly one, append
a newly allocated structure representing the specified
equivalence class to the specification list, LIST and return true.
If LEN is not 1, return false. */
static bool Line 730
append_equiv_class (struct Spec_list *list, Line 731
char const *equiv_class_str, size_t len) Line 732
{
if (len != 1) Line 734
return false; Line 735
struct List_element *new = xmalloc (sizeof *new); Line 737
new->next = NULL; Line 738
new->type = RE_EQUIV_CLASS; Line 739
new->u.equiv_code = *equiv_class_str; Line 740
assert (list->tail); Line 741
list->tail->next = new; Line 742
list->tail = new; Line 743
return true; Line 744
} Block 23
/* Search forward starting at START_IDX for the 2-char sequence
(PRE_BRACKET_CHAR,']') in the string P of length P_LEN. If such
a sequence is found, set *RESULT_IDX to the index of the first
character and return true. Otherwise return false. P may contain
zero bytes. */
static bool Line 753
find_closing_delim (const struct E_string *es, size_t start_idx, Line 754
char pre_bracket_char, size_t *result_idx) Line 755
{
for (size_t i = start_idx; i < es->len - 1; i++) Line 757
if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']' Line 758
&& !es->escaped[i] && !es->escaped[i + 1]) Line 759
{
*result_idx = i; Line 761
return true; Line 762
}
return false; Line 764
} Block 24
/* Parse the bracketed repeat-char syntax. If the P_LEN characters
beginning with P[ START_IDX ] comprise a valid [c*n] construct,
then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
and return zero. If the second character following
the opening bracket is not '*' or if no closing bracket can be
found, return -1. If a closing bracket is found and the
second char is '*', but the string between the '*' and ']' isn't
empty, an octal number, or a decimal number, print an error message
and return -2. */
static int Line 777
find_bracketed_repeat (const struct E_string *es, size_t start_idx, Line 778
unsigned char *char_to_repeat, count *repeat_count, Line 779
size_t *closing_bracket_idx) Line 780
{
assert (start_idx + 1 < es->len); Line 782
if (!es_match (es, start_idx + 1, '*')) Line 783
return -1; Line 784
for (size_t i = start_idx + 2; i < es->len && !es->escaped[i]; i++) Line 786
{
if (es->s[i] == ']') Line 788
{
size_t digit_str_len = i - start_idx - 2; Line 790
*char_to_repeat = es->s[start_idx]; Line 792
if (digit_str_len == 0) Line 793
{
/* We've matched [c*] -- no explicit repeat count. */
*repeat_count = 0; Line 796
}
else Line 798
{
/* Here, we have found [c*s] where s should be a string
of octal (if it starts with '0') or decimal digits. */
char const *digit_str = &es->s[start_idx + 2]; Line 802
char *d_end; Line 803
if ((xstrtoumax (digit_str, &d_end, *digit_str == '0' ? 8 : 10, Line 804
repeat_count, NULL) Line 805
!= LONGINT_OK) Line 806
|| REPEAT_COUNT_MAXIMUM < *repeat_count Line 807
|| digit_str + digit_str_len != d_end) Line 808
{
char *tmp = make_printable_str (digit_str, digit_str_len); Line 810
error (0, 0, Line 811
_("invalid repeat count %s in [c*n] construct"), Line 812
quote (tmp)); Line 813
free (tmp); Line 814
return -2; Line 815
}
}
*closing_bracket_idx = i; Line 818
return 0; Line 819
}
}
return -1; /* No bracket found. */ Line 822
} Block 25
/* Return true if the string at ES->s[IDX] matches the regular
expression '\*[0-9]*\]', false otherwise. The string does not
match if any of its characters are escaped. */
static bool _GL_ATTRIBUTE_PURE Line 829
star_digits_closebracket (const struct E_string *es, size_t idx) Line 830
{
if (!es_match (es, idx, '*')) Line 832
return false; Line 833
for (size_t i = idx + 1; i < es->len; i++) Line 835
if (!ISDIGIT (to_uchar (es->s[i])) || es->escaped[i]) Line 836
return es_match (es, i, ']'); Line 837
return false; Line 838
} Block 26
/* Convert string UNESCAPED_STRING (which has been preprocessed to
convert backslash-escape sequences) of length LEN characters into
a linked list of the following 5 types of constructs:
- [:str:] Character class where 'str' is one of the 12 valid strings.
- [=c=] Equivalence class where 'c' is any single character.
- [c*n] Repeat the single character 'c' 'n' times. n may be omitted.
However, if 'n' is present, it must be a non-negative octal or
decimal integer.
- r-s Range of characters from 'r' to 's'. The second endpoint must
not precede the first in the current collating sequence.
- c Any other character is interpreted as itself. */
static bool Line 853
build_spec_list (const struct E_string *es, struct Spec_list *result) Line 854
{
char const *p = es->s; Line 856
/* The main for-loop below recognizes the 4 multi-character constructs.
A character that matches (in its context) none of the multi-character
constructs is classified as 'normal'. Since all multi-character
constructs have at least 3 characters, any strings of length 2 or
less are composed solely of normal characters. Hence, the index of
the outer for-loop runs only as far as LEN-2. */
size_t i; Line 864
for (i = 0; i + 2 < es->len; /* empty */) Line 865
{
if (es_match (es, i, '[')) Line 867
{
bool matched_multi_char_construct; Line 869
size_t closing_bracket_idx; Line 870
unsigned char char_to_repeat; Line 871
count repeat_count; Line 872
int err; Line 873
matched_multi_char_construct = true; Line 875
if (es_match (es, i + 1, ':') || es_match (es, i + 1, '=')) Line 876
{
size_t closing_delim_idx; Line 878
if (find_closing_delim (es, i + 2, p[i + 1], &closing_delim_idx)) Line 880
{
size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1; Line 882
char const *opnd_str = p + i + 2; Line 883
if (opnd_str_len == 0) Line 885
{
if (p[i + 1] == ':') Line 887
error (0, 0, _("missing character class name '[::]'")); Line 888
else Line 889
error (0, 0, Line 890
_("missing equivalence class character '[==]'"));Line 891
return false; Line 892
}
if (p[i + 1] == ':') Line 895
{
/* FIXME: big comment. */
if (!append_char_class (result, opnd_str, opnd_str_len)) Line 898
{
if (star_digits_closebracket (es, i + 2)) Line 900
goto try_bracketed_repeat; Line 901
else Line 902
{
char *tmp = make_printable_str (opnd_str, Line 904
opnd_str_len); Line 905
error (0, 0, _("invalid character class %s"), Line 906
quote (tmp)); Line 907
free (tmp); Line 908
return false; Line 909
}
}
}
else Line 913
{
/* FIXME: big comment. */
if (!append_equiv_class (result, opnd_str, opnd_str_len)) Line 916
{
if (star_digits_closebracket (es, i + 2)) Line 918
goto try_bracketed_repeat; Line 919
else Line 920
{
char *tmp = make_printable_str (opnd_str, Line 922
opnd_str_len); Line 923
error (0, 0, Line 924
_("%s: equivalence class operand must be a single character"), Line 925
tmp); Line 926
free (tmp); Line 927
return false; Line 928
}
}
}
i = closing_delim_idx + 2; Line 933
continue; Line 934
}
/* Else fall through. This could be [:*] or [=*]. */
}
try_bracketed_repeat: Line 939
/* Determine whether this is a bracketed repeat range
matching the RE \[.\*(dec_or_oct_number)?\]. */
err = find_bracketed_repeat (es, i + 1, &char_to_repeat, Line 943
&repeat_count, Line 944
&closing_bracket_idx); Line 945
if (err == 0) Line 946
{
append_repeated_char (result, char_to_repeat, repeat_count); Line 948
i = closing_bracket_idx + 1; Line 949
}
else if (err == -1) Line 951
{
matched_multi_char_construct = false; Line 953
}
else Line 955
{
/* Found a string that looked like [c*n] but the
numeric part was invalid. */
return false; Line 959
}
if (matched_multi_char_construct) Line 962
continue; Line 963
/* We reach this point if P does not match [:str:], [=c=],
[c*n], or [c*]. Now, see if P looks like a range '[-c'
(from '[' to 'c'). */
}
/* Look ahead one char for ranges like a-z. */
if (es_match (es, i + 1, '-')) Line 971
{
if (!append_range (result, p[i], p[i + 2])) Line 973
return false; Line 974
i += 3; Line 975
}
else Line 977
{
append_normal_char (result, p[i]); Line 979
++i; Line 980
}
}
/* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1]. */
for (; i < es->len; i++) Line 985
append_normal_char (result, p[i]); Line 986
return true; Line 988
} Block 27
/* Advance past the current construct.
S->tail must be non-NULL. */
static void Line 993
skip_construct (struct Spec_list *s) Line 994
{
s->tail = s->tail->next; Line 996
s->state = NEW_ELEMENT; Line 997
} Block 28
/* Given a Spec_list S (with its saved state implicit in the values
of its members 'tail' and 'state'), return the next single character
in the expansion of S's constructs. If the last character of S was
returned on the previous call or if S was empty, this function
returns -1. For example, successive calls to get_next where S
represents the spec-string 'a-d[y*3]' will return the sequence
of values a, b, c, d, y, y, y, -1. Finally, if the construct from
which the returned character comes is [:upper:] or [:lower:], the
parameter CLASS is given a value to indicate which it was. Otherwise
CLASS is set to UL_NONE. This value is used only when constructing
the translation table to verify that any occurrences of upper and
lower class constructs in the spec-strings appear in the same relative
positions. */
static int Line 1014
get_next (struct Spec_list *s, enum Upper_Lower_class *class) Line 1015
{
struct List_element *p; Line 1017
int return_val; Line 1018
int i; Line 1019
if (class) Line 1021
*class = UL_NONE; Line 1022
if (s->state == BEGIN_STATE) Line 1024
{
s->tail = s->head->next; Line 1026
s->state = NEW_ELEMENT; Line 1027
}
p = s->tail; Line 1030
if (p == NULL) Line 1031
return -1; Line 1032
switch (p->type) Line 1034
{
case RE_NORMAL_CHAR: Line 1036
return_val = p->u.normal_char; Line 1037
s->state = NEW_ELEMENT; Line 1038
s->tail = p->next; Line 1039
break; Line 1040
case RE_RANGE: Line 1042
if (s->state == NEW_ELEMENT) Line 1043
s->state = p->u.range.first_char; Line 1044
else Line 1045
++(s->state); Line 1046
return_val = s->state; Line 1047
if (s->state == p->u.range.last_char) Line 1048
{
s->tail = p->next; Line 1050
s->state = NEW_ELEMENT; Line 1051
}
break; Line 1053
case RE_CHAR_CLASS: Line 1055
if (class) Line 1056
{
switch (p->u.char_class) Line 1058
{
case CC_LOWER: Line 1060
*class = UL_LOWER; Line 1061
break; Line 1062
case CC_UPPER: Line 1063
*class = UL_UPPER; Line 1064
break; Line 1065
default: Line 1066
break; Line 1067
}
}
if (s->state == NEW_ELEMENT) Line 1071
{
for (i = 0; i < N_CHARS; i++) Line 1073
if (is_char_class_member (p->u.char_class, i)) Line 1074
break; Line 1075
assert (i < N_CHARS); Line 1076
s->state = i; Line 1077
}
assert (is_char_class_member (p->u.char_class, s->state)); Line 1079
return_val = s->state; Line 1080
for (i = s->state + 1; i < N_CHARS; i++) Line 1081
if (is_char_class_member (p->u.char_class, i)) Line 1082
break; Line 1083
if (i < N_CHARS) Line 1084
s->state = i; Line 1085
else Line 1086
{
s->tail = p->next; Line 1088
s->state = NEW_ELEMENT; Line 1089
}
break; Line 1091
case RE_EQUIV_CLASS: Line 1093
/* FIXME: this assumes that each character is alone in its own
equivalence class (which appears to be correct for my
LC_COLLATE. But I don't know of any function that allows
one to determine a character's equivalence class. */
return_val = p->u.equiv_code; Line 1099
s->state = NEW_ELEMENT; Line 1100
s->tail = p->next; Line 1101
break; Line 1102
case RE_REPEATED_CHAR: Line 1104
/* Here, a repeat count of n == 0 means don't repeat at all. */
if (p->u.repeated_char.repeat_count == 0) Line 1106
{
s->tail = p->next; Line 1108
s->state = NEW_ELEMENT; Line 1109
return_val = get_next (s, class); Line 1110
}
else Line 1112
{
if (s->state == NEW_ELEMENT) Line 1114
{
s->state = 0; Line 1116
}
++(s->state); Line 1118
return_val = p->u.repeated_char.the_repeated_char; Line 1119
if (s->state == p->u.repeated_char.repeat_count) Line 1120
{
s->tail = p->next; Line 1122
s->state = NEW_ELEMENT; Line 1123
}
}
break; Line 1126
default: Line 1128
abort (); ...!common auto-comment...
}
return return_val; Line 1132
} Block 29
/* This is a minor kludge. This function is called from
get_spec_stats to determine the cardinality of a set derived
from a complemented string. It's a kludge in that some of the
same operations are (duplicated) performed in set_initialize. */
static int Line 1140
card_of_complement (struct Spec_list *s) Line 1141
{
int c; Line 1143
int cardinality = N_CHARS; Line 1144
bool in_set[N_CHARS] = { 0, }; Line 1145
s->state = BEGIN_STATE; Line 1147
while ((c = get_next (s, NULL)) != -1) Line 1148
{
cardinality -= (!in_set[c]); Line 1150
in_set[c] = true; Line 1151
}
return cardinality; Line 1153
} Block 30
/* Discard the lengths associated with a case conversion,
as using the actual number of upper or lower case characters
is problematic when they don't match in some locales.
Also ensure the case conversion classes in string2 are
aligned correctly with those in string1.
Note POSIX says the behavior of 'tr "[:upper:]" "[:upper:]"'
is undefined. Therefore we allow it (unlike Solaris)
and treat it as a no-op. */
static void Line 1165
validate_case_classes (struct Spec_list *s1, struct Spec_list *s2) Line 1166
{
size_t n_upper = 0; Line 1168
size_t n_lower = 0; Line 1169
int c1 = 0; Line 1170
int c2 = 0; Line 1171
count old_s1_len = s1->length; Line 1172
count old_s2_len = s2->length; Line 1173
struct List_element *s1_tail = s1->tail; Line 1174
struct List_element *s2_tail = s2->tail; Line 1175
bool s1_new_element = true; Line 1176
bool s2_new_element = true; Line 1177
if (!s2->has_char_class) Line 1179
return; Line 1180
for (int i = 0; i < N_CHARS; i++) Line 1182
{
if (isupper (i)) Line 1184
n_upper++; Line 1185
if (islower (i)) Line 1186
n_lower++; Line 1187
}
s1->state = BEGIN_STATE; Line 1190
s2->state = BEGIN_STATE; Line 1191
while (c1 != -1 && c2 != -1) Line 1193
{
enum Upper_Lower_class class_s1, class_s2; Line 1195
c1 = get_next (s1, &class_s1); Line 1197
c2 = get_next (s2, &class_s2); Line 1198
/* If c2 transitions to a new case class, then
c1 must also transition at the same time. */
if (s2_new_element && class_s2 != UL_NONE Line 1202
&& !(s1_new_element && class_s1 != UL_NONE)) Line 1203
die (EXIT_FAILURE, 0, Line 1204
_("misaligned [:upper:] and/or [:lower:] construct")); Line 1205
/* If case converting, quickly skip over the elements. */
if (class_s2 != UL_NONE) Line 1208
{
skip_construct (s1); Line 1210
skip_construct (s2); Line 1211
/* Discount insignificant/problematic lengths. */
s1->length -= (class_s1 == UL_UPPER ? n_upper : n_lower) - 1; Line 1213
s2->length -= (class_s2 == UL_UPPER ? n_upper : n_lower) - 1; Line 1214
}
s1_new_element = s1->state == NEW_ELEMENT; /* Next element is new. */ Line 1217
s2_new_element = s2->state == NEW_ELEMENT; /* Next element is new. */ Line 1218
}
assert (old_s1_len >= s1->length && old_s2_len >= s2->length); Line 1221
s1->tail = s1_tail; Line 1223
s2->tail = s2_tail; Line 1224
} Block 31
/* Gather statistics about the spec-list S in preparation for the tests
in validate that determine the consistency of the specs. This function
is called at most twice; once for string1, and again for any string2.
LEN_S1 < 0 indicates that this is the first call and that S represents
string1. When LEN_S1 >= 0, it is the length of the expansion of the
constructs in string1, and we can use its value to resolve any
indefinite repeat construct in S (which represents string2). Hence,
this function has the side-effect that it converts a valid [c*]
construct in string2 to [c*n] where n is large enough (or 0) to give
string2 the same length as string1. For example, with the command
tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
be 26 and S (representing string2) would be converted to 'A[\n*24]Z'. */
static void Line 1240
get_spec_stats (struct Spec_list *s) Line 1241
{
struct List_element *p; Line 1243
count length = 0; Line 1244
s->n_indefinite_repeats = 0; Line 1246
s->has_equiv_class = false; Line 1247
s->has_restricted_char_class = false; Line 1248
s->has_char_class = false; Line 1249
for (p = s->head->next; p; p = p->next) Line 1250
{
count len = 0; Line 1252
count new_length; Line 1253
switch (p->type) Line 1255
{
case RE_NORMAL_CHAR: Line 1257
len = 1; Line 1258
break; Line 1259
case RE_RANGE: Line 1261
assert (p->u.range.last_char >= p->u.range.first_char); Line 1262
len = p->u.range.last_char - p->u.range.first_char + 1; Line 1263
break; Line 1264
case RE_CHAR_CLASS: Line 1266
s->has_char_class = true; Line 1267
for (int i = 0; i < N_CHARS; i++) Line 1268
if (is_char_class_member (p->u.char_class, i)) Line 1269
++len; Line 1270
switch (p->u.char_class) Line 1271
{
case CC_UPPER: Line 1273
case CC_LOWER: Line 1274
break; Line 1275
default: Line 1276
s->has_restricted_char_class = true; Line 1277
break; Line 1278
}
break; Line 1280
case RE_EQUIV_CLASS: Line 1282
for (int i = 0; i < N_CHARS; i++) Line 1283
if (is_equiv_class_member (p->u.equiv_code, i)) Line 1284
++len; Line 1285
s->has_equiv_class = true; Line 1286
break; Line 1287
case RE_REPEATED_CHAR: Line 1289
if (p->u.repeated_char.repeat_count > 0) Line 1290
len = p->u.repeated_char.repeat_count; Line 1291
else Line 1292
{
s->indefinite_repeat_element = p; Line 1294
++(s->n_indefinite_repeats); Line 1295
}
break; Line 1297
default: Line 1299
abort (); ...!common auto-comment...
}
/* Check for arithmetic overflow in computing length. Also, reject
any length greater than the maximum repeat count, in case the
length is later used to compute the repeat count for an
indefinite element. */
new_length = length + len; Line 1307
if (! (length <= new_length && new_length <= REPEAT_COUNT_MAXIMUM)) Line 1308
die (EXIT_FAILURE, 0, _("too many characters in set")); Line 1309
length = new_length; Line 1310
}
s->length = length; Line 1313
} Block 32
static void Line 1316
get_s1_spec_stats (struct Spec_list *s1) Line 1317
{
get_spec_stats (s1); Line 1319
if (complement) Line 1320
s1->length = card_of_complement (s1); Line 1321
} Block 33
static void Line 1324
get_s2_spec_stats (struct Spec_list *s2, count len_s1) Line 1325
{
get_spec_stats (s2); Line 1327
if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1) Line 1328
{
s2->indefinite_repeat_element->u.repeated_char.repeat_count = Line 1330
len_s1 - s2->length; Line 1331
s2->length = len_s1; Line 1332
}
} Block 34
static void Line 1336
spec_init (struct Spec_list *spec_list) Line 1337
{
struct List_element *new = xmalloc (sizeof *new); Line 1339
spec_list->head = spec_list->tail = new; Line 1340
spec_list->head->next = NULL; Line 1341
} Block 35
/* This function makes two passes over the argument string S. The first
one converts all \c and \ddd escapes to their one-byte representations.
The second constructs a linked specification list, SPEC_LIST, of the
characters and constructs that comprise the argument string. If either
of these passes detects an error, this function returns false. */
static bool Line 1350
parse_str (char const *s, struct Spec_list *spec_list) Line 1351
{
struct E_string es; Line 1353
bool ok = unquote (s, &es) && build_spec_list (&es, spec_list); Line 1354
es_free (&es); Line 1355
return ok; Line 1356
} Block 36
/* Given two specification lists, S1 and S2, and assuming that
S1->length > S2->length, append a single [c*n] element to S2 where c
is the last character in the expansion of S2 and n is the difference
between the two lengths.
Upon successful completion, S2->length is set to S1->length. The only
way this function can fail to make S2 as long as S1 is when S2 has
zero-length, since in that case, there is no last character to repeat.
So S2->length is required to be at least 1. */
static void Line 1368
string2_extend (const struct Spec_list *s1, struct Spec_list *s2) Line 1369
{
struct List_element *p; Line 1371
unsigned char char_to_repeat; Line 1372
assert (translating); Line 1374
assert (s1->length > s2->length); Line 1375
assert (s2->length > 0); Line 1376
p = s2->tail; Line 1378
switch (p->type) Line 1379
{
case RE_NORMAL_CHAR: Line 1381
char_to_repeat = p->u.normal_char; Line 1382
break; Line 1383
case RE_RANGE: Line 1384
char_to_repeat = p->u.range.last_char; Line 1385
break; Line 1386
case RE_CHAR_CLASS: Line 1387
/* Note BSD allows extending of classes in string2. For example:
tr '[:upper:]0-9' '[:lower:]'
That's not portable however, contradicts POSIX and is dependent
on your collating sequence. */
die (EXIT_FAILURE, 0, Line 1392
_("when translating with string1 longer than string2,\nthe\ Line 1393
latter string must not end with a character class")); Line 1394
case RE_REPEATED_CHAR: Line 1396
char_to_repeat = p->u.repeated_char.the_repeated_char; Line 1397
break; Line 1398
case RE_EQUIV_CLASS: Line 1400
/* This shouldn't happen, because validate exits with an error
if it finds an equiv class in string2 when translating. */
abort (); ...!common auto-comment...
default: Line 1405
abort (); ...!common auto-comment...
}
append_repeated_char (s2, char_to_repeat, s1->length - s2->length); Line 1409
s2->length = s1->length; Line 1410
} Block 37
/* Return true if S is a non-empty list in which exactly one
character (but potentially, many instances of it) appears.
E.g., [X*] or xxxxxxxx. */
static bool Line 1417
homogeneous_spec_list (struct Spec_list *s) Line 1418
{
int b, c; Line 1420
s->state = BEGIN_STATE; Line 1422
if ((b = get_next (s, NULL)) == -1) Line 1424
return false; Line 1425
while ((c = get_next (s, NULL)) != -1) Line 1427
if (c != b) Line 1428
return false; Line 1429
return true; Line 1431
} Block 38
/* Die with an error message if S1 and S2 describe strings that
are not valid with the given command line switches.
A side effect of this function is that if a valid [c*] or
[c*0] construct appears in string2, it is converted to [c*n]
with a value for n that makes s2->length == s1->length. By
the same token, if the --truncate-set1 option is not
given, S2 may be extended. */
static void Line 1442
validate (struct Spec_list *s1, struct Spec_list *s2) Line 1443
{
get_s1_spec_stats (s1); Line 1445
if (s1->n_indefinite_repeats > 0) Line 1446
{
die (EXIT_FAILURE, 0, Line 1448
_("the [c*] repeat construct may not appear in string1")); Line 1449
}
if (s2) Line 1452
{
get_s2_spec_stats (s2, s1->length); Line 1454
if (s2->n_indefinite_repeats > 1) Line 1456
{
die (EXIT_FAILURE, 0, Line 1458
_("only one [c*] repeat construct may appear in string2")); Line 1459
}
if (translating) Line 1462
{
if (s2->has_equiv_class) Line 1464
{
die (EXIT_FAILURE, 0, Line 1466
_("[=c=] expressions may not appear in string2\ Line 1467
when translating")); Line 1468
}
if (s2->has_restricted_char_class) Line 1471
{
die (EXIT_FAILURE, 0, Line 1473
_("when translating, the only character classes that may\ Line 1474
appear in\nstring2 are 'upper' and 'lower'")); Line 1475
}
validate_case_classes (s1, s2); Line 1478
if (s1->length > s2->length) Line 1480
{
if (!truncate_set1) Line 1482
{
/* string2 must be non-empty unless --truncate-set1 is
given or string1 is empty. */
if (s2->length == 0) Line 1487
die (EXIT_FAILURE, 0, Line 1488
_("when not truncating set1, string2 must be non-empty")); Line 1489
string2_extend (s1, s2); Line 1490
}
}
if (complement && s1->has_char_class Line 1494
&& ! (s2->length == s1->length && homogeneous_spec_list (s2))) Line 1495
{
die (EXIT_FAILURE, 0, Line 1497
_("when translating with complemented character classes,\ Line 1498
\nstring2 must map all characters in the domain to one")); Line 1499
}
}
else Line 1502
/* Not translating. */
{
if (s2->n_indefinite_repeats > 0) Line 1505
die (EXIT_FAILURE, 0, Line 1506
_("the [c*] construct may appear in string2 only\ Line 1507
when translating")); Line 1508
}
}
} Block 39
/* Read buffers of SIZE bytes via the function READER (if READER is
NULL, read from stdin) until EOF. When non-NULL, READER is either
read_and_delete or read_and_xlate. After each buffer is read, it is
processed and written to stdout. The buffers are processed so that
multiple consecutive occurrences of the same character in the input
stream are replaced by a single occurrence of that character if the
character is in the squeeze set. */
static void Line 1521
squeeze_filter (char *buf, size_t size, size_t (*reader) (char *, size_t)) Line 1522
{
/* A value distinct from any character that may have been stored in a
buffer as the result of a block-read in the function squeeze_filter. */
const int NOT_A_CHAR = INT_MAX; Line 1526
int char_to_squeeze = NOT_A_CHAR; Line 1528
size_t i = 0; Line 1529
size_t nr = 0; Line 1530
while (true) Line 1532
{
if (i >= nr) Line 1534
{
nr = reader (buf, size); Line 1536
if (nr == 0) Line 1537
break; Line 1538
i = 0; Line 1539
}
size_t begin = i; Line 1542
if (char_to_squeeze == NOT_A_CHAR) Line 1544
{
size_t out_len; Line 1546
/* Here, by being a little tricky, we can get a significant
performance increase in most cases when the input is
reasonably large. Since tr will modify the input only
if two consecutive (and identical) input characters are
in the squeeze set, we can step by two through the data
when searching for a character in the squeeze set. This
means there may be a little more work in a few cases and
perhaps twice as much work in the worst cases where most
of the input is removed by squeezing repeats. But most
uses of this functionality seem to remove less than 20-30%
of the input. */
for (; i < nr && !in_squeeze_set[to_uchar (buf[i])]; i += 2) Line 1558
continue; Line 1559
/* There is a special case when i == nr and we've just
skipped a character (the last one in buf) that is in
the squeeze set. */
if (i == nr && in_squeeze_set[to_uchar (buf[i - 1])]) Line 1564
--i; Line 1565
if (i >= nr) Line 1567
out_len = nr - begin; Line 1568
else Line 1569
{
char_to_squeeze = buf[i]; Line 1571
/* We're about to output buf[begin..i]. */
out_len = i - begin + 1; Line 1573
/* But since we stepped by 2 in the loop above,
out_len may be one too large. */
if (i > 0 && buf[i - 1] == char_to_squeeze) Line 1577
--out_len; Line 1578
/* Advance i to the index of first character to be
considered when looking for a char different from
char_to_squeeze. */
++i; Line 1583
}
if (out_len > 0 Line 1585
&& fwrite (&buf[begin], 1, out_len, stdout) != out_len) Line 1586...!syscalls auto-comment...
die (EXIT_FAILURE, errno, _("write error")); Line 1587
}
if (char_to_squeeze != NOT_A_CHAR) Line 1590
{
/* Advance i to index of first char != char_to_squeeze
(or to nr if all the rest of the characters in this
buffer are the same as char_to_squeeze). */
for (; i < nr && buf[i] == char_to_squeeze; i++) Line 1595
continue; Line 1596
if (i < nr) Line 1597
char_to_squeeze = NOT_A_CHAR; Line 1598
/* If (i >= nr) we've squeezed the last character in this buffer.
So now we have to read a new buffer and continue comparing
characters against char_to_squeeze. */
}
}
}
static size_t Line 1606
plain_read (char *buf, size_t size) Line 1607...!syscalls auto-comment...
{
size_t nr = safe_read (STDIN_FILENO, buf, size); Line 1609...!syscalls auto-comment...
if (nr == SAFE_READ_ERROR) Line 1610
die (EXIT_FAILURE, errno, _("read error")); Line 1611
return nr; Line 1612
} Block 41
/* Read buffers of SIZE bytes from stdin until one is found that
contains at least one character not in the delete set. Store
in the array BUF, all characters from that buffer that are not
in the delete set, and return the number of characters saved
or 0 upon EOF. */
static size_t Line 1621
read_and_delete (char *buf, size_t size) Line 1622
{
size_t n_saved; Line 1624
/* This enclosing do-while loop is to make sure that
we don't return zero (indicating EOF) when we've
just deleted all the characters in a buffer. */
do
{
size_t nr = plain_read (buf, size); Line 1631...!syscalls auto-comment...
if (nr == 0) Line 1633
return 0; Line 1634
/* This first loop may be a waste of code, but gives much
better performance when no characters are deleted in
the beginning of a buffer. It just avoids the copying
of buf[i] into buf[n_saved] when it would be a NOP. */
size_t i; Line 1641
for (i = 0; i < nr && !in_delete_set[to_uchar (buf[i])]; i++) Line 1642
continue; Line 1643
n_saved = i; Line 1644
for (++i; i < nr; i++) Line 1646
if (!in_delete_set[to_uchar (buf[i])]) Line 1647
buf[n_saved++] = buf[i]; Line 1648
}
while (n_saved == 0); Line 1650
return n_saved; Line 1652
} Block 42
/* Read at most SIZE bytes from stdin into the array BUF. Then
perform the in-place and one-to-one mapping specified by the global
array 'xlate'. Return the number of characters read, or 0 upon EOF. */
static size_t Line 1659
read_and_xlate (char *buf, size_t size) Line 1660
{
size_t bytes_read = plain_read (buf, size); Line 1662...!syscalls auto-comment...
for (size_t i = 0; i < bytes_read; i++) Line 1664
buf[i] = xlate[to_uchar (buf[i])]; Line 1665
return bytes_read; Line 1667
} Block 43
/* Initialize a boolean membership set, IN_SET, with the character
values obtained by traversing the linked list of constructs S
using the function 'get_next'. IN_SET is expected to have been
initialized to all zeros by the caller. If COMPLEMENT_THIS_SET
is true the resulting set is complemented. */
static void Line 1676
set_initialize (struct Spec_list *s, bool complement_this_set, bool *in_set) Line 1677
{
int c; Line 1679
s->state = BEGIN_STATE; Line 1681
while ((c = get_next (s, NULL)) != -1) Line 1682
in_set[c] = true; Line 1683
if (complement_this_set) Line 1684
for (size_t i = 0; i < N_CHARS; i++) Line 1685
in_set[i] = (!in_set[i]); Line 1686
} Block 44
int
main (int argc, char **argv) Line 1690
{
int c; Line 1692
int non_option_args; Line 1693
int min_operands; Line 1694
int max_operands; Line 1695
struct Spec_list buf1, buf2; Line 1696
struct Spec_list *s1 = &buf1; Line 1697
struct Spec_list *s2 = &buf2; Line 1698
initialize_main (&argc, &argv); VMS-specific entry point handling wildcard expansion
set_program_name (argv[0]); Retains program name and discards path
setlocale (LC_ALL, ""); Sets up internationalization (i18n)
bindtextdomain (PACKAGE, LOCALEDIR); Assigns i18n directorySets text domain for _() [gettext()] function
textdomain (PACKAGE); Sets text domain for _() [gettext()] function
atexit (close_stdout); Close stdout on exit (see gnulib)
while ((c = getopt_long (argc, argv, "+AcCdst", long_options, NULL)) != -1) Line 1708
{
switch (c) Line 1710
{
case 'A': Line 1712
/* Undocumented option, for compatibility with AIX. */
setlocale (LC_COLLATE, "C"); Sets up internationalization (i18n)
setlocale (LC_CTYPE, "C"); Sets up internationalization (i18n)
break; Line 1716
case 'c': Line 1718
case 'C': Line 1719
complement = true; Line 1720
break; Line 1721
case 'd': Line 1723
delete = true; Line 1724
break; Line 1725
case 's': Line 1727
squeeze_repeats = true; Line 1728
break; Line 1729
case 't': Line 1731
truncate_set1 = true; Line 1732
break; Line 1733
case_GETOPT_HELP_CHAR; Line 1735
case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); Line 1737
default: Line 1739
usage (EXIT_FAILURE); Line 1740
break; Line 1741
}
}
non_option_args = argc - optind; Line 1745
translating = (non_option_args == 2 && !delete); Line 1746
min_operands = 1 + (delete == squeeze_repeats); Line 1747
max_operands = 1 + (delete <= squeeze_repeats); Line 1748
if (non_option_args < min_operands) Line 1750
{
if (non_option_args == 0) Line 1752
error (0, 0, _("missing operand")); Line 1753
else Line 1754
{
error (0, 0, _("missing operand after %s"), quote (argv[argc - 1])); Line 1756
fprintf (stderr, "%s\n", Line 1757
_(squeeze_repeats Line 1758
? N_("Two strings must be given when " Line 1759
"both deleting and squeezing repeats.") Line 1760
: N_("Two strings must be given when translating."))); Line 1761
}
usage (EXIT_FAILURE); Line 1763
}
if (max_operands < non_option_args) Line 1766
{
error (0, 0, _("extra operand %s"), quote (argv[optind + max_operands])); Line 1768
if (non_option_args == 2) Line 1769
fprintf (stderr, "%s\n", Line 1770
_("Only one string may be given when " Line 1771
"deleting without squeezing repeats.")); Line 1772
usage (EXIT_FAILURE); Line 1773
}
spec_init (s1); Line 1776
if (!parse_str (argv[optind], s1)) Line 1777
return EXIT_FAILURE; Line 1778
if (non_option_args == 2) Line 1780
{
spec_init (s2); Line 1782
if (!parse_str (argv[optind + 1], s2)) Line 1783
return EXIT_FAILURE; Line 1784
}
else Line 1786
s2 = NULL; Line 1787
validate (s1, s2); Line 1789
/* Use binary I/O, since 'tr' is sometimes used to transliterate
non-printable characters, or characters which are stripped away
by text-mode reads (like CR and ^Z). */
xset_binary_mode (STDIN_FILENO, O_BINARY); Line 1794
xset_binary_mode (STDOUT_FILENO, O_BINARY); Line 1795
fadvise (stdin, FADVISE_SEQUENTIAL); Line 1796...!syscalls auto-comment...
if (squeeze_repeats && non_option_args == 1) Line 1798
{
set_initialize (s1, complement, in_squeeze_set); Line 1800
squeeze_filter (io_buf, sizeof io_buf, plain_read); Line 1801
}
else if (delete && non_option_args == 1) Line 1803
{
set_initialize (s1, complement, in_delete_set); Line 1805
while (true) Line 1807
{
size_t nr = read_and_delete (io_buf, sizeof io_buf); Line 1809
if (nr == 0) Line 1810
break; Line 1811
if (fwrite (io_buf, 1, nr, stdout) != nr) Line 1812...!syscalls auto-comment...
die (EXIT_FAILURE, errno, _("write error")); Line 1813
}
}
else if (squeeze_repeats && delete && non_option_args == 2) Line 1816
{
set_initialize (s1, complement, in_delete_set); Line 1818
set_initialize (s2, false, in_squeeze_set); Line 1819
squeeze_filter (io_buf, sizeof io_buf, read_and_delete); Line 1820
}
else if (translating) Line 1822
{
if (complement) Line 1824
{
bool *in_s1 = in_delete_set; Line 1826
set_initialize (s1, false, in_s1); Line 1828
s2->state = BEGIN_STATE; Line 1829
for (int i = 0; i < N_CHARS; i++) Line 1830
xlate[i] = i; Line 1831
for (int i = 0; i < N_CHARS; i++) Line 1832
{
if (!in_s1[i]) Line 1834
{
int ch = get_next (s2, NULL); Line 1836
assert (ch != -1 || truncate_set1); Line 1837
if (ch == -1) Line 1838
{
/* This will happen when tr is invoked like e.g.
tr -cs A-Za-z0-9 '\012'. */
break; Line 1842
}
xlate[i] = ch; Line 1844
}
}
}
else Line 1848
{
int c1, c2; Line 1850
enum Upper_Lower_class class_s1; Line 1851
enum Upper_Lower_class class_s2; Line 1852
for (int i = 0; i < N_CHARS; i++) Line 1854
xlate[i] = i; Line 1855
s1->state = BEGIN_STATE; Line 1856
s2->state = BEGIN_STATE; Line 1857
while (true) Line 1858
{
c1 = get_next (s1, &class_s1); Line 1860
c2 = get_next (s2, &class_s2); Line 1861
if (class_s1 == UL_LOWER && class_s2 == UL_UPPER) Line 1863
{
for (int i = 0; i < N_CHARS; i++) Line 1865
if (islower (i)) Line 1866
xlate[i] = toupper (i); Line 1867
}
else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER) Line 1869
{
for (int i = 0; i < N_CHARS; i++) Line 1871
if (isupper (i)) Line 1872
xlate[i] = tolower (i); Line 1873
}
else Line 1875
{
/* The following should have been checked by validate... */
if (c1 == -1 || c2 == -1) Line 1878
break; Line 1879
xlate[c1] = c2; Line 1880
}
/* When case-converting, skip the elements as an optimization. */
if (class_s2 != UL_NONE) Line 1884
{
skip_construct (s1); Line 1886
skip_construct (s2); Line 1887
}
}
assert (c1 == -1 || truncate_set1); Line 1890
}
if (squeeze_repeats) Line 1892
{
set_initialize (s2, false, in_squeeze_set); Line 1894
squeeze_filter (io_buf, sizeof io_buf, read_and_xlate); Line 1895
}
else Line 1897
{
while (true) Line 1899
{
size_t bytes_read = read_and_xlate (io_buf, sizeof io_buf); Line 1901
if (bytes_read == 0) Line 1902
break; Line 1903
if (fwrite (io_buf, 1, bytes_read, stdout) != bytes_read) Line 1904...!syscalls auto-comment...
die (EXIT_FAILURE, errno, _("write error")); Line 1905
}
}
}
if (close (STDIN_FILENO) != 0) Line 1910...!syscalls auto-comment...
die (EXIT_FAILURE, errno, _("standard input")); Line 1911
return EXIT_SUCCESS; Line 1913
} Block 45