问：pdfplumber 处理 PDF 表格时，具体的参数该如何设置？

提问于 2024-07-11 09:52:15

回答 0关注 0查看 154

我想提取整个页面的表格，但是最后一行提取不到，使用 PageImage 调试得到如下图：

以下是源代码：

def pdf_to_df(file_path, start: int, end: int, del_cols: list, *,
              table_settings=None, show_debug=True):
    """Extract one table per page from a PDF page range into a DataFrame.

    Each page from ``start`` to ``end`` (1-based, inclusive) is passed to
    ``page.extract_table``; the per-page results are stacked, duplicate
    rows are removed, and the columns listed in ``del_cols`` are dropped.

    Args:
        file_path: Absolute path of the PDF file to read.
        start: 1-based number of the first page that contains the table.
        end: 1-based number of the last page that contains the table.
        del_cols: Column labels to remove from the combined result.
        table_settings: Optional mapping of pdfplumber table-finder
            settings. Its entries override the built-in settings below
            key by key; omitted keys keep the built-in value.
        show_debug: When true (the default, matching the previous
            behaviour), render every processed page with the table-finder
            overlay and open it in the image viewer.

    Returns:
        The combined DataFrame. Row labels are the per-page positions, so
        they may repeat across pages.

    Raises:
        ValueError: If ``start`` is smaller than 1.
    """
    if start < 1:
        # start - 1 would become a negative index and silently select
        # pages counted from the end of the document.
        raise ValueError(f"start must be >= 1, got {start}")

    # Built once per call; the values do not depend on the page.
    # NOTE(review): the snap/join/intersection tolerances are 0 (only
    # intersection_y_tolerance is 50), so line edges must coincide exactly
    # to be merged or to form a cell corner. pdfplumber documents 3 as the
    # default for these. If a row is missing (e.g. the last one), try
    # larger values via ``table_settings`` and confirm with the debug
    # image -- the root cause is not visible from this code alone.
    settings = {
        "vertical_strategy": "lines",
        "horizontal_strategy": "lines",
        "explicit_vertical_lines": [],
        "explicit_horizontal_lines": [],
        "snap_tolerance": 0,
        "snap_x_tolerance": 0,
        "snap_y_tolerance": 0,
        "join_tolerance": 0,
        "join_x_tolerance": 0,
        "join_y_tolerance": 0,
        "edge_min_length": 0,
        "min_words_vertical": 0,
        "min_words_horizontal": 0,
        "text_tolerance": 1,
        "text_x_tolerance": 1,
        "text_y_tolerance": 1,
        "intersection_tolerance": 0,
        "intersection_x_tolerance": 0,
        "intersection_y_tolerance": 50,
    }
    if table_settings:
        settings.update(table_settings)

    page_frames = []
    with pdfplumber.open(file_path) as pdf:
        # Index page by page (rather than slicing) so that an out-of-range
        # page number still fails loudly instead of being truncated.
        for page_index in range(start - 1, end):
            page = pdf.pages[page_index]

            # extract_table yields the rows of a single table, or None
            # when the finder detects no table on this page.
            rows = page.extract_table(settings)
            if rows:
                page_frames.append(pd.DataFrame(rows))

            if show_debug:
                image = page.to_image(resolution=150)
                # Draw the detected lines/cells on top of the page render.
                image.reset().debug_tablefinder(settings)
                image.show()

    # Concatenate once instead of growing a DataFrame inside the loop.
    combined = pd.concat(page_frames) if page_frames else pd.DataFrame()
    combined = combined.drop_duplicates()  # remove repeated rows
    # Remove the caller-specified (e.g. empty) columns.
    return combined.drop(del_cols, axis=1)

python