Skip to content

Commit 6e89d22

Browse files
committed
apacheGH-38676: [Python] Fix potential deadlock when CSV reading errors out
1 parent 160d45c commit 6e89d22

File tree

9 files changed

+92
-20
lines changed

9 files changed

+92
-20
lines changed

python/pyarrow/_csv.pyx

+2-3
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,7 @@ from collections.abc import Mapping
2626

2727
from pyarrow.includes.common cimport *
2828
from pyarrow.includes.libarrow cimport *
29-
from pyarrow.includes.libarrow_python cimport (MakeInvalidRowHandler,
30-
PyInvalidRowCallback)
29+
from pyarrow.includes.libarrow_python cimport *
3130
from pyarrow.lib cimport (check_status, Field, MemoryPool, Schema,
3231
RecordBatchReader, ensure_type,
3332
maybe_unbox_memory_pool, get_input_stream,
@@ -1251,7 +1250,7 @@ def read_csv(input_file, read_options=None, parse_options=None,
12511250
CCSVParseOptions c_parse_options
12521251
CCSVConvertOptions c_convert_options
12531252
CIOContext io_context
1254-
shared_ptr[CCSVReader] reader
1253+
SharedPtrNoGIL[CCSVReader] reader
12551254
shared_ptr[CTable] table
12561255

12571256
_get_reader(input_file, read_options, &stream)

python/pyarrow/_dataset.pxd

+4-4
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ cdef CFileSource _make_file_source(object file, FileSystem filesystem=*)
3131
cdef class DatasetFactory(_Weakrefable):
3232

3333
cdef:
34-
shared_ptr[CDatasetFactory] wrapped
34+
SharedPtrNoGIL[CDatasetFactory] wrapped
3535
CDatasetFactory* factory
3636

3737
cdef init(self, const shared_ptr[CDatasetFactory]& sp)
@@ -45,7 +45,7 @@ cdef class DatasetFactory(_Weakrefable):
4545
cdef class Dataset(_Weakrefable):
4646

4747
cdef:
48-
shared_ptr[CDataset] wrapped
48+
SharedPtrNoGIL[CDataset] wrapped
4949
CDataset* dataset
5050
public dict _scan_options
5151

@@ -59,7 +59,7 @@ cdef class Dataset(_Weakrefable):
5959

6060
cdef class Scanner(_Weakrefable):
6161
cdef:
62-
shared_ptr[CScanner] wrapped
62+
SharedPtrNoGIL[CScanner] wrapped
6363
CScanner* scanner
6464

6565
cdef void init(self, const shared_ptr[CScanner]& sp)
@@ -122,7 +122,7 @@ cdef class FileWriteOptions(_Weakrefable):
122122
cdef class Fragment(_Weakrefable):
123123

124124
cdef:
125-
shared_ptr[CFragment] wrapped
125+
SharedPtrNoGIL[CFragment] wrapped
126126
CFragment* fragment
127127

128128
cdef void init(self, const shared_ptr[CFragment]& sp)

python/pyarrow/_dataset.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -3227,7 +3227,7 @@ cdef class RecordBatchIterator(_Weakrefable):
32273227
object iterator_owner
32283228
# Iterator is a non-POD type and Cython uses offsetof, leading
32293229
# to a compiler warning unless wrapped like so
3230-
shared_ptr[CRecordBatchIterator] iterator
3230+
SharedPtrNoGIL[CRecordBatchIterator] iterator
32313231

32323232
def __init__(self):
32333233
_forbid_instantiation(self.__class__, subclasses_instead=False)
@@ -3273,7 +3273,7 @@ cdef class TaggedRecordBatchIterator(_Weakrefable):
32733273
"""An iterator over a sequence of record batches with fragments."""
32743274
cdef:
32753275
object iterator_owner
3276-
shared_ptr[CTaggedRecordBatchIterator] iterator
3276+
SharedPtrNoGIL[CTaggedRecordBatchIterator] iterator
32773277

32783278
def __init__(self):
32793279
_forbid_instantiation(self.__class__, subclasses_instead=False)

python/pyarrow/_parquet.pyx

+3-3
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import warnings
2424
from cython.operator cimport dereference as deref
2525
from pyarrow.includes.common cimport *
2626
from pyarrow.includes.libarrow cimport *
27+
from pyarrow.includes.libarrow_python cimport *
2728
from pyarrow.lib cimport (_Weakrefable, Buffer, Schema,
2829
check_status,
2930
MemoryPool, maybe_unbox_memory_pool,
@@ -1165,7 +1166,7 @@ cdef class ParquetReader(_Weakrefable):
11651166
cdef:
11661167
object source
11671168
CMemoryPool* pool
1168-
unique_ptr[FileReader] reader
1169+
UniquePtrNoGIL[FileReader] reader
11691170
FileMetaData _metadata
11701171
shared_ptr[CRandomAccessFile] rd_handle
11711172

@@ -1334,7 +1335,7 @@ cdef class ParquetReader(_Weakrefable):
13341335
vector[int] c_row_groups
13351336
vector[int] c_column_indices
13361337
shared_ptr[CRecordBatch] record_batch
1337-
unique_ptr[CRecordBatchReader] recordbatchreader
1338+
UniquePtrNoGIL[CRecordBatchReader] recordbatchreader
13381339

13391340
self.set_batch_size(batch_size)
13401341

@@ -1366,7 +1367,6 @@ cdef class ParquetReader(_Weakrefable):
13661367
check_status(
13671368
recordbatchreader.get().ReadNext(&record_batch)
13681369
)
1369-
13701370
if record_batch.get() == NULL:
13711371
break
13721372

python/pyarrow/includes/libarrow_python.pxd

+7
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,13 @@ cdef extern from "arrow/python/common.h" namespace "arrow::py":
261261
void RestorePyError(const CStatus& status) except *
262262

263263

264+
cdef extern from "arrow/python/common.h" namespace "arrow::py" nogil:
265+
cdef cppclass SharedPtrNoGIL[T](shared_ptr[T]):
266+
pass
267+
cdef cppclass UniquePtrNoGIL[T, DELETER=*](unique_ptr[T, DELETER]):
268+
pass
269+
270+
264271
cdef extern from "arrow/python/inference.h" namespace "arrow::py":
265272
c_bool IsPyBool(object o)
266273
c_bool IsPyInt(object o)

python/pyarrow/ipc.pxi

+1-1
Original file line numberDiff line numberDiff line change
@@ -977,7 +977,7 @@ cdef _wrap_record_batch_with_metadata(CRecordBatchWithMetadata c):
977977

978978
cdef class _RecordBatchFileReader(_Weakrefable):
979979
cdef:
980-
shared_ptr[CRecordBatchFileReader] reader
980+
SharedPtrNoGIL[CRecordBatchFileReader] reader
981981
shared_ptr[CRandomAccessFile] file
982982
CIpcReadOptions options
983983

python/pyarrow/lib.pxd

+2-2
Original file line numberDiff line numberDiff line change
@@ -552,12 +552,12 @@ cdef class CompressedOutputStream(NativeFile):
552552

553553
cdef class _CRecordBatchWriter(_Weakrefable):
554554
cdef:
555-
shared_ptr[CRecordBatchWriter] writer
555+
SharedPtrNoGIL[CRecordBatchWriter] writer
556556

557557

558558
cdef class RecordBatchReader(_Weakrefable):
559559
cdef:
560-
shared_ptr[CRecordBatchReader] reader
560+
SharedPtrNoGIL[CRecordBatchReader] reader
561561

562562

563563
cdef class Codec(_Weakrefable):

python/pyarrow/src/arrow/python/common.h

+50-5
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
#include <functional>
2121
#include <memory>
22+
#include <optional>
2223
#include <utility>
2324

2425
#include "arrow/buffer.h"
@@ -134,13 +135,15 @@ class ARROW_PYTHON_EXPORT PyAcquireGIL {
134135
// A RAII-style helper that releases the GIL until the end of a lexical block
135136
class ARROW_PYTHON_EXPORT PyReleaseGIL {
136137
public:
137-
PyReleaseGIL() { saved_state_ = PyEval_SaveThread(); }
138-
139-
~PyReleaseGIL() { PyEval_RestoreThread(saved_state_); }
138+
PyReleaseGIL() : ptr_(PyEval_SaveThread(), &unique_ptr_deleter) {}
140139

141140
private:
142-
PyThreadState* saved_state_;
143-
ARROW_DISALLOW_COPY_AND_ASSIGN(PyReleaseGIL);
141+
static void unique_ptr_deleter(PyThreadState* state) {
142+
if (state) {
143+
PyEval_RestoreThread(state);
144+
}
145+
}
146+
std::unique_ptr<PyThreadState, decltype(&unique_ptr_deleter)> ptr_;
144147
};
145148

146149
// A helper to call safely into the Python interpreter from arbitrary C++ code.
@@ -235,6 +238,48 @@ class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef {
235238
}
236239
};
237240

241+
template <template <typename...> typename SmartPtr, typename... Ts>
242+
class SmartPtrNoGIL : public SmartPtr<Ts...> {
243+
using Base = SmartPtr<Ts...>;
244+
245+
public:
246+
template <typename... Args>
247+
SmartPtrNoGIL(Args&&... args) : Base(std::forward<Args>(args)...) {}
248+
249+
~SmartPtrNoGIL() { reset(); }
250+
251+
template <typename... Args>
252+
void reset(Args&&... args) {
253+
auto release_guard = optional_gil_release();
254+
Base::reset(std::forward<Args>(args)...);
255+
}
256+
257+
template <typename V>
258+
SmartPtrNoGIL& operator=(V&& v) {
259+
auto release_guard = optional_gil_release();
260+
Base::operator=(std::forward<V>(v));
261+
return *this;
262+
}
263+
264+
private:
265+
// Only release the GIL if we own an object *and* the Python runtime is
266+
// valid *and* the GIL is held.
267+
std::optional<PyReleaseGIL> optional_gil_release() const {
268+
if (this->get() != nullptr && Py_IsInitialized() && PyGILState_Check()) {
269+
return PyReleaseGIL();
270+
}
271+
return {};
272+
}
273+
};
274+
275+
/// \brief A std::shared_ptr<T, ...> subclass that releases the GIL when destroying T
276+
template <typename... Ts>
277+
using SharedPtrNoGIL = SmartPtrNoGIL<std::shared_ptr, Ts...>;
278+
279+
/// \brief A std::unique_ptr<T, ...> subclass that releases the GIL when destroying T
280+
template <typename... Ts>
281+
using UniquePtrNoGIL = SmartPtrNoGIL<std::unique_ptr, Ts...>;
282+
238283
template <typename Fn>
239284
struct BoundFunction;
240285

python/pyarrow/tests/test_csv.py

+21
Original file line numberDiff line numberDiff line change
@@ -1970,3 +1970,24 @@ def test_write_csv_decimal(tmpdir, type_factory):
19701970
out = read_csv(tmpdir / "out.csv")
19711971

19721972
assert out.column('col').cast(type) == table.column('col')
1973+
1974+
1975+
def test_read_csv_gil_deadlock():
1976+
# GH-38676
1977+
# This test depends on several preconditions:
1978+
# - the CSV input is a Python file object
1979+
# - reading the CSV file produces an error
1980+
data = b"a,b,c"
1981+
1982+
class MyBytesIO(io.BytesIO):
1983+
def read(self, *args):
1984+
time.sleep(0.001)
1985+
return super().read(*args)
1986+
1987+
def readinto(self, *args):
1988+
time.sleep(0.001)
1989+
return super().readinto(*args)
1990+
1991+
for i in range(20):
1992+
with pytest.raises(pa.ArrowInvalid):
1993+
read_csv(MyBytesIO(data))

0 commit comments

Comments
 (0)