Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
)
from pandas.util._validators import check_dtype_backend

from pandas.core.dtypes.common import is_file_like

from pandas import (
DataFrame,
get_option,
Expand Down Expand Up @@ -658,6 +660,13 @@ def read_parquet(
0 3 8
1 4 9
"""
# gh-62922: validate path type early to match documented API expectations
# and provide a consistent, clear user error immediately.
if not (isinstance(path, (str, os.PathLike)) or is_file_like(path)):
raise TypeError(
f"read_parquet expected str/os.PathLike or file-like object, "
f"got {type(path).__name__} type"
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should probably be done using get_handle in the _get_path_or_handle function.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, thanks for the review!

I looked into _get_path_or_handle more closely.

get_handle is only invoked when we already know path_or_handle is a string and not a directory. For invalid inputs like a list, we never reach that branch as they get passed through stringify_path unchanged.

Also, _get_path_or_handle is only used in PyArrowImpl.read, not in FastParquetImpl.read. So if we relied only on _get_path_or_handle to validate input, validation coverage would be asymmetric across engines.

So, I propose validating the path type in read_parquet before engine dispatch. Alternatively, I can factor out a small _validate_parquet_path_arg(path) helper and call it at the top of both PyArrowImpl.read and FastParquetImpl.read.

Let me know which placement you prefer.


impl = get_engine(engine)
check_dtype_backend(dtype_backend)
Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,42 @@ def check_partition_names(path, expected):
assert dataset.partitioning.schema.names == expected


def test_read_parquet_invalid_path_types(tmp_path, engine):
# GH #62922
df = pd.DataFrame({"a": [1]})
path = tmp_path / "test_read_parquet.parquet"
df.to_parquet(path, engine=engine)

bad_path_types = [
[str(path)], # list
(str(path),), # tuple
b"raw-bytes", # bytes
]
Comment on lines +257 to +261
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Testing the list case is sufficient

for bad in bad_path_types:
match = (
f"read_parquet expected str/os.PathLike or file-like object, "
f"got {type(bad).__name__} type"
)
with pytest.raises(TypeError, match=match):
read_parquet(bad, engine=engine)


def test_read_parquet_valid_path_types(tmp_path, engine):
    # GH #62922: str, os.PathLike, and file-like inputs must all be accepted.
    frame = pd.DataFrame({"a": [1]})
    target = tmp_path / "test_read_parquet.parquet"
    frame.to_parquet(target, engine=engine)

    # plain string path
    read_parquet(str(target), engine=engine)
    # os.PathLike path
    read_parquet(pathlib.Path(target), engine=engine)

    # file-like object: round-trip through an in-memory buffer
    buffer = BytesIO()
    frame.to_parquet(buffer, engine=engine)
    buffer.seek(0)
    read_parquet(buffer, engine=engine)
Comment on lines +271 to +284
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is not needed



def test_invalid_engine(df_compat, temp_file):
msg = "engine must be one of 'pyarrow', 'fastparquet'"
with pytest.raises(ValueError, match=msg):
Expand Down
Loading