-
-
Notifications
You must be signed in to change notification settings - Fork 1.4k
/
Copy pathcount_fast.rs
237 lines (221 loc) · 9.3 KB
/
count_fast.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
// cSpell:ignore sysconf
use crate::word_count::WordCount;
use super::WordCountable;
#[cfg(any(target_os = "linux", target_os = "android"))]
use std::fs::OpenOptions;
use std::io::{self, ErrorKind, Read};
#[cfg(unix)]
use libc::{_SC_PAGESIZE, S_IFREG, sysconf};
#[cfg(unix)]
use nix::sys::stat;
#[cfg(unix)]
use std::io::{Seek, SeekFrom};
#[cfg(any(target_os = "linux", target_os = "android"))]
use std::os::fd::{AsFd, AsRawFd};
#[cfg(windows)]
use std::os::windows::fs::MetadataExt;
#[cfg(windows)]
const FILE_ATTRIBUTE_ARCHIVE: u32 = 32;
#[cfg(windows)]
const FILE_ATTRIBUTE_NORMAL: u32 = 128;
#[cfg(any(target_os = "linux", target_os = "android"))]
use libc::S_IFIFO;
#[cfg(any(target_os = "linux", target_os = "android"))]
use uucore::pipes::{pipe, splice, splice_exact};
const BUF_SIZE: usize = 16 * 1024;
#[cfg(any(target_os = "linux", target_os = "android"))]
const SPLICE_SIZE: usize = 128 * 1024;
/// Counts the bytes readable from `fd` with the `splice` system call, which is
/// faster than using `read`. Only built for Linux and Android.
///
/// The data is spliced from `fd` into an intermediate pipe and from there into
/// `/dev/null`, so it is discarded without being copied into this process.
///
/// # Errors
///
/// On failure the `Err` variant carries the number of bytes that were already
/// consumed from `fd`, since the caller will fall back to a simpler method and
/// has to continue counting from that point.
#[inline]
#[cfg(any(target_os = "linux", target_os = "android"))]
fn count_bytes_using_splice(fd: &impl AsFd) -> Result<usize, usize> {
    // Nothing has been consumed yet, so every setup failure reports 0 bytes.
    let sink = OpenOptions::new()
        .write(true)
        .open("/dev/null")
        .map_err(|_| 0_usize)?;
    let sink_stat = stat::fstat(sink.as_raw_fd()).map_err(|_| 0_usize)?;
    let sink_dev = sink_stat.st_rdev as libc::dev_t;

    // A proper /dev/null is the character device with major 1, minor 3.
    // Writing to anything else is probably bad. Bit of an edge case, but it
    // has been known to happen.
    let is_proper_null = libc::major(sink_dev) == 1 && libc::minor(sink_dev) == 3;
    if !is_proper_null {
        return Err(0);
    }

    let (pipe_rd, pipe_wr) = pipe().map_err(|_| 0_usize)?;

    let mut total = 0;
    loop {
        let moved = match splice(fd, &pipe_wr, SPLICE_SIZE) {
            // Nothing left to splice: the input is exhausted.
            Ok(0) => return Ok(total),
            Ok(moved) => moved,
            Err(_) => return Err(total),
        };
        // These bytes have left `fd` for good, so account for them before
        // draining the pipe: a failure below must still report them.
        total += moved;
        splice_exact(&pipe_rd, &sink, moved).map_err(|_| total)?;
    }
}
/// Counts the bytes available from `handle` as cheaply as possible.
///
/// In the special case where we only need to count the number of bytes, there
/// are several optimizations we can do:
/// 1. On Unix, we can simply `stat` the file if it is regular.
/// 2. On Linux and Android -- if the above did not work -- we can use splice
///    to count the number of bytes if the file is a FIFO.
/// 3. On Windows we can use `std::os::windows::fs::MetadataExt` to get the
///    file size for regular files.
/// 4. Otherwise, we just read normally, but without the overhead of counting
///    other things such as lines and words.
///
/// Returns the number of bytes counted, together with the I/O error that
/// stopped the count, if any. When an error is returned the count covers the
/// bytes accounted for before the failure.
#[inline]
pub(crate) fn count_bytes_fast<T: WordCountable>(handle: &mut T) -> (usize, Option<io::Error>) {
    let mut byte_count = 0;

    #[cfg(unix)]
    {
        let fd = handle.as_raw_fd();
        if let Ok(stat) = stat::fstat(fd) {
            // The file type is encoded in the `S_IFMT` bits of `st_mode` and
            // has to be compared for equality. The type codes are not
            // independent flags: on the usual Unix encodings `S_IFSOCK` and
            // `S_IFLNK` both contain the `S_IFREG` bit, so a plain bit test
            // would mistake them for regular files.
            let file_type = stat.st_mode as libc::mode_t & libc::S_IFMT;

            // If the file is regular, then the `st_size` should hold
            // the file's size in bytes.
            // If stat.st_size = 0 then
            //  - either the size is 0
            //  - or the size is unknown.
            // The second case happens for files in pseudo-filesystems.
            // For example with /proc/version.
            // So, if it is 0 we don't report that and instead do a full read.
            //
            // Another thing to consider for files in pseudo-filesystems like /proc, /sys
            // and similar is that they could report `st_size` greater than actual content.
            // For example /sys/kernel/profiling could report `st_size` equal to
            // system page size (typically 4096 on 64bit system), while its file content
            // would count up only to a couple of bytes.
            // This condition usually occurs for files in pseudo-filesystems like /proc, /sys
            // that report `st_size` in the multiples of system page size.
            // In such cases - attempt `seek()` almost to the end of the file
            // and then fall back on read to count the rest.
            //
            // And finally a special case of input redirection in *nix shell:
            // `( wc -c ; wc -c ) < file` should return
            // ```
            // size_of_file
            // 0
            // ```
            // Similarly
            // `( head -c1 ; wc -c ) < file` should return
            // ```
            // first_byte_of_file
            // size_of_file - 1
            // ```
            // Since the input stream from file is treated as continuous across both
            // commands inside ().
            // In cases like this, due to `<` redirect, the `stat.st_mode` would report
            // input as a regular file and `stat.st_size` would report the size of file
            // on disk and NOT the remaining number of bytes in the input stream.
            // However, the raw file descriptor in this situation would be equal to `0`
            // for STDIN in both invocations.
            // Therefore we cannot rely on `st_size` here and should fall back on full read.
            if fd > 0 && file_type == S_IFREG && stat.st_size > 0 {
                // SAFETY: `sysconf` only takes an integer selector and has no
                // memory-safety preconditions; `_SC_PAGESIZE` is a valid selector.
                let sys_page_size = unsafe { sysconf(_SC_PAGESIZE) as usize };
                if stat.st_size as usize % sys_page_size > 0 {
                    // regular file or file from /proc, /sys and similar pseudo-filesystems
                    // with size that is NOT a multiple of system page size
                    return (stat.st_size as usize, None);
                } else if let Some(file) = handle.inner_file() {
                    // On some platforms `stat.st_blksize` and `stat.st_size`
                    // are of different types: i64 vs i32
                    // i.e. MacOS on Apple Silicon (aarch64-apple-darwin),
                    // Debian Linux on ARM (aarch64-unknown-linux-gnu),
                    // 32bit i686 targets, etc.
                    // While on the others they are of the same type, hence
                    // the `useless_conversion` allowances below.
                    #[allow(clippy::useless_conversion)]
                    let size = i64::from(stat.st_size);
                    #[allow(clippy::useless_conversion)]
                    let block = i64::from(stat.st_blksize);
                    // Skip ahead to a position short of the reported end; the
                    // read loop at the bottom counts whatever is left.
                    let offset = size - size % (block + 1);

                    // The bytes skipped by the seek are credited up front. If
                    // the seek fails nothing was skipped and the count stays 0.
                    if let Ok(n) = file.seek(SeekFrom::Start(offset as u64)) {
                        byte_count = n as usize;
                    }
                }
            }

            #[cfg(any(target_os = "linux", target_os = "android"))]
            {
                // Else, if we're on Linux and our file is a FIFO pipe
                // (or stdin), we use splice to count the number of bytes.
                if file_type == S_IFIFO {
                    match count_bytes_using_splice(handle) {
                        Ok(n) => return (n, None),
                        // Splice gave up part-way: keep what it consumed and
                        // let the read loop below count the remainder.
                        Err(n) => byte_count = n,
                    }
                }
            }
        }
    }

    #[cfg(windows)]
    {
        if let Some(file) = handle.inner_file() {
            if let Ok(metadata) = file.metadata() {
                let attributes = metadata.file_attributes();

                // Only trust the reported size when the ARCHIVE or NORMAL
                // attribute is set; anything else falls through to `read`.
                if (attributes & FILE_ATTRIBUTE_ARCHIVE) != 0
                    || (attributes & FILE_ATTRIBUTE_NORMAL) != 0
                {
                    return (metadata.file_size() as usize, None);
                }
            }
        }
    }

    // Fall back on `read`, but without the overhead of counting words and lines.
    let mut buf = [0_u8; BUF_SIZE];
    loop {
        match handle.read(&mut buf) {
            Ok(0) => return (byte_count, None),
            Ok(n) => {
                byte_count += n;
            }
            // An interrupted read is not a failure; retry it.
            Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
            Err(e) => return (byte_count, Some(e)),
        }
    }
}
/// Reads `handle` to the end and tallies bytes, UTF-8 encoded Unicode
/// characters and/or newlines, depending on the const flags.
///
/// This corresponds to the `-c`, `-m` and `-l` command line flags to wc.
///
/// # Type parameters
///
/// * `R` - The reader type the stream is pulled from.
/// * `COUNT_BYTES` - When `true`, accumulate into `WordCount::bytes`.
/// * `COUNT_CHARS` - When `true`, accumulate into `WordCount::chars`.
/// * `COUNT_LINES` - When `true`, accumulate into `WordCount::lines`.
///
/// # Returns
///
/// The totals gathered so far, plus the I/O error that ended the read early,
/// if any. Fields whose flag is `false` keep their default value.
pub(crate) fn count_bytes_chars_and_lines_fast<
    R: Read,
    const COUNT_BYTES: bool,
    const COUNT_CHARS: bool,
    const COUNT_LINES: bool,
>(
    handle: &mut R,
) -> (WordCount, Option<io::Error>) {
    let mut counts = WordCount::default();
    let mut chunk = [0_u8; BUF_SIZE];

    loop {
        // Narrow the buffer to the part the reader actually filled.
        let filled = match handle.read(&mut chunk) {
            Ok(0) => return (counts, None),
            Ok(len) => &chunk[..len],
            // An interrupted read is not a failure; retry it.
            Err(ref err) if err.kind() == ErrorKind::Interrupted => continue,
            Err(err) => return (counts, Some(err)),
        };

        if COUNT_BYTES {
            counts.bytes += filled.len();
        }
        if COUNT_CHARS {
            counts.chars += bytecount::num_chars(filled);
        }
        if COUNT_LINES {
            counts.lines += bytecount::count(filled, b'\n');
        }
    }
}