From a5202bd58531923ea9f93cc35ddeec5e3a8e0189 Mon Sep 17 00:00:00 2001
From: =?utf8?q?P=C3=A1draig=20Brady?=
Date: Sat, 23 Feb 2019 21:23:47 -0800
Subject: [PATCH] wc: treat non breaking space as a word separator
* src/wc.c (iswnbspace): A new function to match
characters in this class.
(isnbspace): Likewise for single byte charsets.
(main): Initialize posixly_correct from the environment,
to allow NBSP handling to be disabled in non C locales.
(wc): Call is[w]nbspace() along with is[w]space.
* bootstrap.conf: Ensure btowc is available.
* tests/misc/wc-nbsp.sh: A new test.
* tests/local.mk: Reference the new test.
* NEWS: Mention the change in behavior.
---
NEWS | 3 +++
bootstrap.conf | 1 +
src/wc.c | 25 +++++++++++++++++++++++--
tests/local.mk | 1 +
tests/misc/wc-nbsp.sh | 42 ++++++++++++++++++++++++++++++++++++++++++
5 files changed, 70 insertions(+), 2 deletions(-)
create mode 100755 tests/misc/wc-nbsp.sh
diff --git a/NEWS b/NEWS
index e73cb52b8..c190efe27 100644
--- a/NEWS
+++ b/NEWS
@@ -67,6 +67,9 @@ GNU coreutils NEWS -*- outline -*-
operator, so POSIX changed this to 'test -e FILE'. Scripts using it were
already broken and non-portable; the -a unary operator was never documented.
+ wc now treats non breaking space characters as word delimiters
+ unless the POSIXLY_CORRECT environment variable is set.
+
** New features
id now supports specifying multiple users.
diff --git a/bootstrap.conf b/bootstrap.conf
index a525ef442..49261524a 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -38,6 +38,7 @@ gnulib_modules="
backup-rename
base32
base64
+ btowc
buffer-lcm
c-strcase
cl-strtod
diff --git a/src/wc.c b/src/wc.c
index 179abbe2c..23818042f 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -74,6 +74,9 @@ static bool have_read_stdin;
/* Used to determine if file size can be determined without reading. */
static size_t page_size;
+/* If true, do _not_ treat non breaking space as a word separator. */
+static bool posixly_correct;
+
/* The result of calling fstat or stat on a file descriptor or file. */
struct fstatus
{
@@ -147,6 +150,21 @@ the following order: newline, word, character, byte, maximum line length.\n\
exit (status);
}
+/* Return non zero if WC is a non breaking space, unless posixly_correct. */
+static int _GL_ATTRIBUTE_PURE
+iswnbspace (wint_t wc)
+{
+ return ! posixly_correct
+ && (wc == 0x00A0 || wc == 0x2007 /* NBSP, figure space */
+ || wc == 0x202F || wc == 0x2060); /* narrow NBSP, word joiner */
+}
+
+static int
+isnbspace (int c)
+{
+ return iswnbspace (btowc (c)); /* Widen the byte, then reuse the check. */
+}
+
/* FILE is the name of the file (or NULL for standard input)
associated with the specified counters. */
static void
@@ -455,7 +473,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
if (width > 0)
linepos += width;
}
- if (iswspace (wide_char))
+ if (iswspace (wide_char) || iswnbspace (wide_char))
goto mb_word_separator;
in_word = true;
}
@@ -538,7 +556,8 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
if (isprint (to_uchar (p[-1])))
{
linepos++;
- if (isspace (to_uchar (p[-1])))
+ if (isspace (to_uchar (p[-1]))
+ || isnbspace (to_uchar (p[-1])))
goto word_separator;
in_word = true;
}
@@ -681,6 +700,8 @@ main (int argc, char **argv)
so that processes running in parallel do not intersperse their output. */
setvbuf (stdout, NULL, _IOLBF, 0);
+ posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
+
print_lines = print_words = print_chars = print_bytes = false;
print_linelength = false;
total_lines = total_words = total_chars = total_bytes = max_line_length = 0;
diff --git a/tests/local.mk b/tests/local.mk
index bcb61edc7..add379a47 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -273,6 +273,7 @@ all_tests = \
tests/misc/wc.pl \
tests/misc/wc-files0-from.pl \
tests/misc/wc-files0.sh \
+ tests/misc/wc-nbsp.sh \
tests/misc/wc-parallel.sh \
tests/misc/wc-proc.sh \
tests/misc/cat-proc.sh \
diff --git a/tests/misc/wc-nbsp.sh b/tests/misc/wc-nbsp.sh
new file mode 100755
index 000000000..11ee0d655
--- /dev/null
+++ b/tests/misc/wc-nbsp.sh
@@ -0,0 +1,42 @@
+#!/bin/sh
+# Test non breaking space handling
+
+# Copyright (C) 2019 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see