From d9dfaa9d1d7d5bb1c81b3c32628c81693edfd9dd Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu <snitish.iitk@gmail.com> Date: Mon, 2 Dec 2024 16:16:33 -0800 Subject: [PATCH] BUG: Fix pd.read_html handling of rowspan in table header (#60464) * BUG: Fix pd.read_html handling of rowspan in table header * BUG: Fix docstring error in _expand_colspan_rowspan * BUG: Update return type for _expand_colspan_rowspan * BUG: Address review and add not to whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/html.py | 58 ++++++++++++++++++++++------------ pandas/tests/io/test_html.py | 27 ++++++++++++++++ 3 files changed, 66 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f4e7281ca0..83638ce87f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -701,6 +701,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) +- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`) - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) diff --git a/pandas/io/html.py b/pandas/io/html.py index c9897f628f..183af3a032 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -454,15 +454,26 @@ class _HtmlFrameParser: while body_rows and row_is_all_th(body_rows[0]): header_rows.append(body_rows.pop(0)) - header = self._expand_colspan_rowspan(header_rows, section="header") - body = self._expand_colspan_rowspan(body_rows, section="body") - footer = self._expand_colspan_rowspan(footer_rows, section="footer") + header, rem = self._expand_colspan_rowspan(header_rows, section="header") + body, rem = self._expand_colspan_rowspan( + body_rows, + section="body", + remainder=rem, + overflow=len(footer_rows) > 0, + ) + footer, _ = self._expand_colspan_rowspan( + footer_rows, section="footer", remainder=rem, overflow=False + ) return header, body, footer def _expand_colspan_rowspan( - self, rows, section: Literal["header", "footer", "body"] - ) -> list[list]: + self, + rows, + section: Literal["header", "footer", "body"], + remainder: list[tuple[int, str | tuple, int]] | None = None, + overflow: bool = True, + ) -> tuple[list[list], list[tuple[int, str | tuple, int]]]: """ Given a list of <tr>s, return a list of text rows. @@ -471,12 +482,20 @@ class _HtmlFrameParser: rows : list of node-like List of <tr>s section : the section that the rows belong to (header, body or footer). + remainder: list[tuple[int, str | tuple, int]] | None + Any remainder from the expansion of previous section + overflow: bool + If true, return any partial rows as 'remainder'. If not, use up any + partial rows. True by default. Returns ------- list of list Each returned row is a list of str text, or tuple (text, link) if extract_links is not None. + remainder + Remaining partial rows if any. If overflow is False, an empty list + is returned. Notes ----- @@ -485,9 +504,7 @@ class _HtmlFrameParser: """ all_texts = [] # list of rows, each a list of str text: str | tuple - remainder: list[ - tuple[int, str | tuple, int] - ] = [] # list of (index, text, nrows) + remainder = remainder if remainder is not None else [] for tr in rows: texts = [] # the output for this row @@ -528,19 +545,20 @@ class _HtmlFrameParser: all_texts.append(texts) remainder = next_remainder - # Append rows that only appear because the previous row had non-1 - # rowspan - while remainder: - next_remainder = [] - texts = [] - for prev_i, prev_text, prev_rowspan in remainder: - texts.append(prev_text) - if prev_rowspan > 1: - next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) - all_texts.append(texts) - remainder = next_remainder + if not overflow: + # Append rows that only appear because the previous row had non-1 + # rowspan + while remainder: + next_remainder = [] + texts = [] + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + all_texts.append(texts) + remainder = next_remainder - return all_texts + return all_texts, remainder def _handle_hidden_tables(self, tbl_list, attr_name: str): """ diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 73e9933e36..bef28c4f02 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1004,6 +1004,33 @@ class TestReadHtml: tm.assert_frame_equal(result, expected) + def test_rowspan_in_header_overflowing_to_body(self, flavor_read_html): + # GH60210 + + result = flavor_read_html( + StringIO( + """ + <table> + <tr> + <th rowspan="2">A</th> + <th>B</th> + </tr> + <tr> + <td>1</td> + </tr> + <tr> + <td>C</td> + <td>2</td> + </tr> + </table> + """ + ) + )[0] + + expected = DataFrame(data=[["A", 1], ["C", 2]], columns=["A", "B"]) + + tm.assert_frame_equal(result, expected) + def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): # GH17054 result = flavor_read_html( -- GitLab