Skip to content

Commit 33d1d85

Browse files
quinnj authored and nickrobinson251 committed
Don't pool small inline string columns by default
Fixes #982. Since the sizes of `String1` and `String3` are <= the size of the ref integer type we use for pooling (`UInt32`), let's avoid pooling them by default. Users can still request that specific columns be pooled, as always.
1 parent 3d47734 commit 33d1d85

File tree

3 files changed

+23
-12
lines changed

3 files changed

+23
-12
lines changed

src/utils.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ tupcat(::Type{Tuple{T, T2, T3}}, S) where {T, T2, T3} = Tuple{T, T2, T3, S}
5151
tupcat(::Type{Tuple{T, T2, T3, T4}}, S) where {T, T2, T3, T4} = Tuple{T, T2, T3, T4, S}
5252
tupcat(::Type{T}, S) where {T <: Tuple} = Tuple{Any[(fieldtype(T, i) for i = 1:fieldcount(T))..., S]...}
5353

54-
const StringTypes = Union{Type{String}, Type{PosLenString}, Type{<:InlineString}}
54+
const StringTypes = Union{Type{String}, Type{PosLenString}, Type{InlineString}, Type{String7}, Type{String15}, Type{String31}, Type{String63}, Type{String127}, Type{String255}}
5555
pickstringtype(T, maxstringsize) = T === InlineString ? (maxstringsize < DEFAULT_MAX_INLINE_STRING_LENGTH ? InlineStringType(maxstringsize) : String) : T
5656

5757
# we define our own bit flag on a Parsers.ReturnCode to signal if a column needs to promote to string

test/basics.jl

+11
Original file line numberDiff line numberDiff line change
@@ -819,4 +819,15 @@ f = CSV.File(IOBuffer(str); delim=" ", header=false, types=(i,nm) -> (i == 5 ? I
819819
f = CSV.File(IOBuffer(str); delim=" ", header=false, types=Dict(r".*" => Float16))
820820
@test Float16 <: eltype(f.Column5)
821821

822+
# 982
823+
data = """a,b,c,d
824+
A,BB,CCC,DDDD
825+
A,BB,CCC,DDDD
826+
"""
827+
f = CSV.File(IOBuffer(data))
828+
@test !(f.a isa PooledArray)
829+
@test !(f.b isa PooledArray)
830+
@test !(f.c isa PooledArray)
831+
@test f.d isa PooledArray
832+
822833
end

test/runtests.jl

+11-11
Original file line numberDiff line numberDiff line change
@@ -24,27 +24,27 @@ include("write.jl")
2424

2525
@testset "PooledArrays" begin
2626

27-
f = CSV.File(IOBuffer("X\nb\nc\na\nc"), pool=true)
28-
@test typeof(f.X) == PooledArrays.PooledArray{InlineString1,UInt32,1,Array{UInt32,1}}
27+
f = CSV.File(IOBuffer("X\nbbbb\ncccc\naaaa\ncccc"), pool=true)
28+
@test typeof(f.X) == PooledArrays.PooledArray{String7,UInt32,1,Array{UInt32,1}}
2929
@test (length(f), length(f.names)) == (4, 1)
30-
@test f.X == ["b", "c", "a", "c"]
30+
@test f.X == ["bbbb", "cccc", "aaaa", "cccc"]
3131
@test f.X.refs[2] == f.X.refs[4]
3232

33-
f = CSV.File(IOBuffer("X\nb\nc\na\nc"), pool=0.75)
34-
@test typeof(f.X) == PooledArrays.PooledArray{InlineString1,UInt32,1,Array{UInt32,1}}
33+
f = CSV.File(IOBuffer("X\nbbbb\ncccc\naaaa\ncccc"), pool=0.75)
34+
@test typeof(f.X) == PooledArrays.PooledArray{String7,UInt32,1,Array{UInt32,1}}
3535
@test (length(f), length(f.names)) == (4, 1)
36-
@test f.X == ["b", "c", "a", "c"]
36+
@test f.X == ["bbbb", "cccc", "aaaa", "cccc"]
3737
@test f.X.refs[2] == f.X.refs[4]
3838

39-
f = CSV.File(IOBuffer("X\nb\nc\n\nc"), pool=true, ignoreemptyrows=false)
40-
@test typeof(f.X) == PooledArray{Union{Missing, InlineString1},UInt32,1,Array{UInt32,1}}
39+
f = CSV.File(IOBuffer("X\nbbbb\ncccc\n\ncccc"), pool=true, ignoreemptyrows=false)
40+
@test typeof(f.X) == PooledArray{Union{Missing, String7},UInt32,1,Array{UInt32,1}}
4141
@test (length(f), length(f.names)) == (4, 1)
4242
@test f.X[3] === missing
4343

44-
f = CSV.File(IOBuffer("X\nc\nc\n\nc\nc\nc\nc\nc\nc"), pool=0.25, ignoreemptyrows=false)
45-
@test typeof(f.X) == PooledArray{Union{Missing, InlineString1},UInt32,1,Array{UInt32,1}}
44+
f = CSV.File(IOBuffer("X\ncccc\ncccc\n\ncccc\ncccc\ncccc\ncccc\ncccc\ncccc"), pool=0.25, ignoreemptyrows=false)
45+
@test typeof(f.X) == PooledArray{Union{Missing, String7},UInt32,1,Array{UInt32,1}}
4646
@test (length(f), length(f.names)) == (9, 1)
47-
@test isequal(f.X, ["c", "c", missing, "c", "c", "c", "c", "c", "c"])
47+
@test isequal(f.X, ["cccc", "cccc", missing, "cccc", "cccc", "cccc", "cccc", "cccc", "cccc"])
4848

4949
end
5050

0 commit comments

Comments
 (0)