Skip to content

Commit 5b10ba2

Browse files
mraabhijit, dependabot[bot], VYaswanthKumar, Navya1707, krishna-datta
authored
FIX: itemsize wrong for date32[day][pyarrow] dtype #57948 (#62657)
Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Yaswanth Kumar <155723049+VYaswanthKumar@users.noreply.github.com> Co-authored-by: Navya Srivastava <143343265+Navya1707@users.noreply.github.com> Co-authored-by: krishna datta <19500807+krishna-datta@users.noreply.github.com> Co-authored-by: ZA1815 <zaahme18@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akashisang <151737560+Akashisang@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: BreezeLune <1066773178@qq.com> Co-authored-by: jbrockmendel <jbrockmendel@gmail.com> Co-authored-by: Aokizy2 <3441854632@qq.com> Co-authored-by: aokizy <14817191+aokizy2@user.noreply.gitee.com> Co-authored-by: Sumeet Bhatnagar <69593471+nemo-1999@users.noreply.github.com>
1 parent b477b87 commit 5b10ba2

File tree

2 files changed

+95
-2
lines changed

2 files changed

+95
-2
lines changed

pandas/core/dtypes/dtypes.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2308,8 +2308,35 @@ def kind(self) -> str:
23082308

23092309
@cache_readonly
23102310
def itemsize(self) -> int:
2311-
"""Return the number of bytes in this dtype"""
2312-
return self.numpy_dtype.itemsize
2311+
"""
2312+
Return the number of bytes in this dtype.
2313+
2314+
For Arrow-backed dtypes:
2315+
- Returns the fixed-width bit size divided by 8 for standard fixed-width types.
2316+
- For boolean types, returns the NumPy itemsize.
2317+
- Falls back to the NumPy dtype itemsize for variable-width & unsupported types.
2318+
2319+
Examples
2320+
--------
2321+
>>> import pyarrow as pa
2322+
>>> import pandas as pd
2323+
>>> dtype = pd.ArrowDtype(pa.int32())
2324+
>>> dtype.itemsize
2325+
4
2326+
2327+
>>> dtype = pd.ArrowDtype(pa.bool_())
2328+
>>> dtype.itemsize # falls back to numpy dtype
2329+
1
2330+
"""
2331+
if pa.types.is_boolean(self.pyarrow_dtype):
2332+
return self.numpy_dtype.itemsize
2333+
2334+
# Use pyarrow itemsize for fixed-width data types
2335+
# e.g. int32 -> 32 bits // 8 = 4 bytes
2336+
try:
2337+
return self.pyarrow_dtype.bit_width // 8
2338+
except (ValueError, AttributeError, NotImplementedError):
2339+
return self.numpy_dtype.itemsize
23132340

23142341
def construct_array_type(self) -> type_t[ArrowExtensionArray]:
23152342
"""

pandas/tests/extension/test_arrow.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3702,6 +3702,72 @@ def test_pow_with_all_na_float():
37023702
tm.assert_series_equal(result, expected)
37033703

37043704

3705+
@pytest.mark.parametrize(
3706+
"type_name, expected_size",
3707+
[
3708+
# Integer types
3709+
("int8", 1),
3710+
("int16", 2),
3711+
("int32", 4),
3712+
("int64", 8),
3713+
("uint8", 1),
3714+
("uint16", 2),
3715+
("uint32", 4),
3716+
("uint64", 8),
3717+
# Floating point types
3718+
("float16", 2),
3719+
("float32", 4),
3720+
("float64", 8),
3721+
# Boolean
3722+
("bool_", 1),
3723+
# Date and timestamp types
3724+
("date32", 4),
3725+
("date64", 8),
3726+
("timestamp", 8),
3727+
# Time types
3728+
("time32", 4),
3729+
("time64", 8),
3730+
# Decimal types
3731+
("decimal128", 16),
3732+
("decimal256", 32),
3733+
],
3734+
)
3735+
def test_arrow_dtype_itemsize_fixed_width(type_name, expected_size):
3736+
# GH 57948
3737+
3738+
parametric_type_map = {
3739+
"timestamp": pa.timestamp("ns"),
3740+
"time32": pa.time32("s"),
3741+
"time64": pa.time64("ns"),
3742+
"decimal128": pa.decimal128(38, 10),
3743+
"decimal256": pa.decimal256(76, 10),
3744+
}
3745+
3746+
if type_name in parametric_type_map:
3747+
arrow_type = parametric_type_map.get(type_name)
3748+
else:
3749+
arrow_type = getattr(pa, type_name)()
3750+
dtype = ArrowDtype(arrow_type)
3751+
3752+
if type_name == "bool_":
3753+
expected_size = dtype.numpy_dtype.itemsize
3754+
3755+
assert dtype.itemsize == expected_size, (
3756+
f"{type_name} expected {expected_size}, got {dtype.itemsize} "
3757+
f"(bit_width={getattr(dtype.pyarrow_dtype, 'bit_width', 'N/A')})"
3758+
)
3759+
3760+
3761+
@pytest.mark.parametrize("type_name", ["string", "binary", "large_string"])
3762+
def test_arrow_dtype_itemsize_variable_width(type_name):
3763+
# GH 57948
3764+
3765+
arrow_type = getattr(pa, type_name)()
3766+
dtype = ArrowDtype(arrow_type)
3767+
3768+
assert dtype.itemsize == dtype.numpy_dtype.itemsize
3769+
3770+
37053771
def test_cast_pontwise_result_decimal_nan():
37063772
# GH#62522 we don't want to get back null[pyarrow] here
37073773
ser = pd.Series([], dtype="float64[pyarrow]")

0 commit comments

Comments
 (0)