Skip to content

Commit f388214

Browse files
authored
wc: fix word undercount with invalid byte sequences (#10348)
* wc: fix word undercount with invalid byte sequences * wc: update utf8 test counts to account for invalid byte sequences * wc: remove unnecessary borrow in test
1 parent cbbff30 commit f388214

File tree

2 files changed

+21
-3
lines changed

2 files changed

+21
-3
lines changed

src/uu/wc/src/wc.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -624,10 +624,18 @@ fn process_chunk<
624624
total.max_line_length = max(*current_len, total.max_line_length);
625625
}
626626

627-
fn handle_error(error: BufReadDecoderError<'_>, total: &mut WordCount) -> Option<io::Error> {
627+
fn handle_error(
628+
error: BufReadDecoderError<'_>,
629+
total: &mut WordCount,
630+
in_word: &mut bool,
631+
) -> Option<io::Error> {
628632
match error {
629633
BufReadDecoderError::InvalidByteSequence(bytes) => {
630634
total.bytes += bytes.len();
635+
if !(*in_word) {
636+
*in_word = true;
637+
total.words += 1;
638+
}
631639
}
632640
BufReadDecoderError::Io(e) => return Some(e),
633641
}
@@ -660,7 +668,7 @@ fn word_count_from_reader_specialized<
660668
);
661669
}
662670
Err(e) => {
663-
if let Some(e) = handle_error(e, &mut total) {
671+
if let Some(e) = handle_error(e, &mut total, &mut in_word) {
664672
return (total, Some(e));
665673
}
666674
}

tests/by-util/test_wc.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ fn test_utf8() {
6565
.args(&["-lwmcL"])
6666
.pipe_in_fixture("UTF_8_test.txt")
6767
.succeeds()
68-
.stdout_is(" 303 2119 22457 23025 79\n");
68+
.stdout_is(" 303 2178 22457 23025 79\n");
6969
}
7070

7171
#[test]
@@ -826,6 +826,16 @@ fn wc_w_words_with_emoji_separator() {
826826
.stdout_contains("3");
827827
}
828828

829+
#[test]
830+
fn test_invalid_byte_sequence_word_count() {
831+
// wc should count invalid byte sequences as words
832+
// Input: "a \xff b\n" should produce: 1 line, 3 words, 6 bytes
833+
new_ucmd!()
834+
.pipe_in([b'a', b' ', 0xff, b' ', b'b', b'\n'])
835+
.succeeds()
836+
.stdout_is(" 1 3 6\n");
837+
}
838+
829839
#[cfg(unix)]
830840
#[test]
831841
fn test_simd_respects_glibc_tunables() {

0 commit comments

Comments
 (0)