EXCEL表格拆分工具源码(某列按关键词拆分)

python · 06-19

该代码主要用于拆分清单中不同县部门的部分,后来优化了一下,使其可以适用于不同表格,也算比较实用,不同于方方格子的拆分是,方方格子的拆分适用于单元格内内容固定,该脚本逻辑是“只要包含某个关键词就拆分入其中”,并且会将未拆分过的单独列出,方便检查。

import tkinter as tk
from tkinterdnd2 import DND_FILES, TkinterDnD
from tkinter import messagebox, ttk
import openpyxl
from openpyxl import Workbook, load_workbook
from openpyxl.utils import column_index_from_string, get_column_letter
from copy import copy
import os
import threading
import concurrent.futures
import time

# 定义拆分列号识别的关键词
KEYWORDS = ["县级指导部门", "上级部门(单位)", "承接部门"]

# 全局变量存储自动识别结果
auto_headers = {}
auto_split_cols = {}

# 创建主窗口
root = TkinterDnD.Tk()
root.title("履职清单拆分器 智能识别版 v1.3 by杨海")
root.geometry("600x700")

# 创建Canvas和Scrollbar
canvas = tk.Canvas(root)
scrollbar = tk.Scrollbar(root, orient="vertical", command=canvas.yview)
canvas.configure(yscrollcommand=scrollbar.set)

# 创建一个Frame来放置所有组件
main_frame = tk.Frame(canvas)

# 将main_frame添加到canvas中
canvas.create_window((0, 0), window=main_frame, anchor="nw")

# 绑定滚动事件
def on_mousewheel(event):
    canvas.yview_scroll(int(-1*(event.delta/120)), "units")

canvas.bind_all("<MouseWheel>", on_mousewheel)

# 放置canvas和scrollbar
canvas.pack(side="left", fill="both", expand=True)
scrollbar.pack(side="right", fill="y")

# 更新canvas的滚动区域
def update_scrollregion(event):
    canvas.configure(scrollregion=canvas.bbox("all"))

main_frame.bind("<Configure>", update_scrollregion)

# 拖拽区域
drop_frame = tk.Frame(main_frame, bg="lightgray", relief="raised", bd=2)
drop_frame.pack(pady=20, padx=20, fill="x")
drop_label = tk.Label(drop_frame, text="拖拽 Excel 文件到此处", bg="lightgray", font=("黑体", 12), height=5)
drop_label.pack(expand=True)
file_path = None

def on_drop(event):
    global file_path
    file_path = event.data.strip('{}')
    drop_label.config(text=f"已上传文件: {os.path.basename(file_path)}")
    load_sheets()
    update_columns_entries()
    status_label.config(text="请勾选需要拆分的表格")

drop_label.drop_target_register(DND_FILES)
drop_label.dnd_bind("<<Drop>>", on_drop)

# 工作表选择区域
sheets_frame = tk.LabelFrame(main_frame, text="选择需要拆分的工作表", font=("黑体", 11), padx=10, pady=10)
sheets_frame.pack(pady=10, padx=20, fill="both")
sheets_vars = []

def load_sheets():
    global file_path, sheets_vars, auto_headers, auto_split_cols
    if not file_path: return
    try:
        wb = load_workbook(file_path)
        sheets = wb.sheetnames
        for w in sheets_frame.winfo_children(): w.destroy()
        sheets_vars.clear()
        auto_headers.clear()
        auto_split_cols.clear()
        for sheet in sheets:
            var = tk.BooleanVar()
            chk = tk.Checkbutton(sheets_frame, text=sheet, variable=var, font=("黑体", 10), command=update_columns_entries)
            chk.pack(anchor='w')
            sheets_vars.append((sheet, var))

            ws = wb[sheet]
            # 自动识别表头行数
            header_rows = None
            for r in range(1, ws.max_row+1):
                cell = ws.cell(r,1)
                if isinstance(cell.value, (int,float)):
                    header_rows = r-1
                    break
            auto_headers[sheet] = header_rows

            # 自动识别拆分列
            split_col = None
            found_cell = None
            if header_rows:
                for c in range(1, ws.max_column+1):
                    for r in range(1, header_rows+1):
                        v = ws.cell(r,c).value
                        if v and any(k in str(v) for k in KEYWORDS):
                            split_col = get_column_letter(c)
                            found_cell = (r, split_col)
                            break
                    if split_col: break
            auto_split_cols[sheet] = (split_col, found_cell) if split_col else None

    except Exception as e:
        messagebox.showerror("错误", f"无法加载 Excel 文件: {e}")

# 列和表头输入区域
columns_frame = tk.LabelFrame(main_frame, text="设置拆分参数", font=("黑体", 11), padx=10, pady=10)
columns_frame.pack(pady=10, padx=20, fill="both")
columns_entries = {}
headers_entries = {}

def update_columns_entries():
    for w in columns_frame.winfo_children(): w.destroy()
    selected = [s for s,v in sheets_vars if v.get()]
    if not selected:
        tk.Label(columns_frame, text="请勾选需要拆分的表格", font=("黑体",10,"italic")).pack(pady=20)
        status_label.config(text="请勾选需要拆分的表格")
    else:
        status_label.config(text="请确认输入并点击确认")
        for sheet in selected:
            sf = tk.Frame(columns_frame, relief="groove", bd=1); sf.pack(pady=5, fill="x")
            tk.Label(sf, text=f"工作表: {sheet}", font=("黑体",10,"bold")).pack(anchor="w", pady=2)
            # 拆分列
            cf = tk.Frame(sf); cf.pack(fill="x", pady=2)
            tk.Label(cf, text="拆分列号 (如 A, B):", font=("黑体",10)).pack(side="left")
            ec = tk.Entry(cf, width=5, font=("黑体",10)); ec.pack(side="left", padx=5)
            auto = auto_split_cols.get(sheet)
            if auto:
                sc, fc = auto
                ec.insert(0, sc)
                txt = f"(智能识别: {sc} 列"
                if fc: txt += f",基于{fc[0]}行)"
                else: txt += ")"
                tk.Label(cf, text=txt, font=("黑体",10), fg="green").pack(side="left", padx=5)
            else:
                tk.Label(cf, text="(智能识别失败)", font=("黑体",10,"italic"), fg="red").pack(side="left", padx=5)

            # 表头行数
            hf = tk.Frame(sf); hf.pack(fill="x", pady=2)
            tk.Label(hf, text="表头行数:", font=("黑体",10)).pack(side="left")
            eh = tk.Entry(hf, width=5, font=("黑体",10)); eh.pack(side="left", padx=5)
            ah = auto_headers.get(sheet)
            if ah is not None:
                eh.insert(0,str(ah))
                tk.Label(hf, text=f"(智能识别: {ah} 行)", font=("黑体",10), fg="green").pack(side="left", padx=5)
            else:
                tk.Label(hf, text="(智能识别失败)", font=("黑体",10,"italic"), fg="red").pack(side="left", padx=5)

            columns_entries[sheet] = ec
            headers_entries[sheet] = eh

# 关键词输入区域
keywords_frame = tk.LabelFrame(main_frame, text="输入拆分关键词", font=("黑体",11), padx=10, pady=10)
keywords_frame.pack(pady=10, padx=20, fill="x")
tk.Label(keywords_frame, text="关键词(以 @ 分隔):", font=("黑体",10)).pack()
keywords_entry = tk.Entry(keywords_frame, font=("黑体",10))
keywords_entry.pack(fill="x")
add_serial_var = tk.BooleanVar()
add_serial_check = tk.Checkbutton(keywords_frame, text="在拆分后的文件名前添加序号", variable=add_serial_var, font=("黑体",10))
add_serial_check.pack(pady=5)

# 保存方式选择区域
save_mode_frame = tk.LabelFrame(main_frame, text="选择保存方式", font=("黑体",11), padx=10, pady=10)
save_mode_frame.pack(pady=10, padx=20, fill="x")
save_mode_var = tk.StringVar(value="separate")
tk.Radiobutton(save_mode_frame, text="每个拆分结果保存为一个工作簿", variable=save_mode_var, value="separate", font=("黑体",10)).pack(anchor="w")
tk.Radiobutton(save_mode_frame, text="所有拆分结果保存在一个工作簿中", variable=save_mode_var, value="combined", font=("黑体",10)).pack(anchor="w")

# 线程数输入区域
threads_frame = tk.LabelFrame(main_frame, text="拆分线程数", font=("黑体",11), padx=10, pady=10)
threads_frame.pack(pady=10, padx=20, fill="x")
tk.Label(threads_frame, text="线程数(默认智能判断):", font=("黑体",10)).pack(side="left")
threads_entry = tk.Entry(threads_frame, width=5, font=("黑体",10))
threads_entry.pack(side="left", padx=5)
default_threads = os.cpu_count() or 1
threads_entry.insert(0, str(default_threads))
tk.Label(threads_frame, text=f"(智能判断: {default_threads} 线程)", font=("黑体",10), fg="green").pack(side="left", padx=5)

# 进度条区域
progress_frame = tk.Frame(main_frame)
progress_frame.pack(pady=10)
progress_var = tk.DoubleVar()
progress_bar = ttk.Progressbar(progress_frame, variable=progress_var, maximum=100)
progress_bar.pack(side="left", fill="x", expand=True)
progress_label = tk.Label(progress_frame, text="0%", font=("黑体", 10))
progress_label.pack(side="left", padx=5)

# 按钮和状态
button_frame = tk.Frame(main_frame); button_frame.pack(pady=10)
confirm_button = tk.Button(button_frame, text="确认拆分", command=lambda: process(), font=("黑体",10), bg="lightgreen")
confirm_button.pack(side="left", padx=5)
status_label = tk.Label(main_frame, text="请拖拽 Excel 文件到此处", font=("黑体",10,"italic"), fg="gray")
status_label.pack(pady=10)

def process():
    global file_path
    if not file_path:
        messagebox.showerror("错误", "未上传文件")
        return

    selected = [s for s,v in sheets_vars if v.get()]
    if not selected:
        messagebox.showerror("错误", "未选择任何工作表")
        return

    # 读取用户输入参数
    columns = {}
    headers = {}
    for sheet in selected:
        try:
            col = column_index_from_string(columns_entries[sheet].get().strip().upper())
            columns[sheet] = col
        except:
            messagebox.showerror("错误", f"{sheet} 列号无效")
            return
        try:
            hr = int(headers_entries[sheet].get().strip())
            headers[sheet] = hr
        except:
            messagebox.showerror("错误", f"{sheet} 表头行数无效")
            return

    kws = [k.strip() for k in keywords_entry.get().split("@") if k.strip()]
    if not kws:
        messagebox.showerror("错误", "未输入关键词")
        return

    # 读取线程数
    try:
        threads = int(threads_entry.get().strip())
        if threads < 1:
            threads = default_threads
    except:
        threads = default_threads

    # 禁用确认按钮,防止重复点击
    confirm_button.config(state="disabled")

    # 启动线程
    threading.Thread(target=process_thread, args=(selected, columns, headers, kws, threads)).start()

def process_thread(selected, columns, headers, kws, threads):
    try:
        # 准备输出
        result_dir = "拆分结果"
        os.makedirs(result_dir, exist_ok=True)
        wb = load_workbook(file_path)
        split_rows = {s:set() for s in selected}

        total_keywords = len(kws)
        completed_keywords = 0
        lock = threading.Lock()
        add_serial = add_serial_var.get()
        save_mode = save_mode_var.get()

        if save_mode == "combined":
            new_wb = Workbook()
            if new_wb.sheetnames: new_wb.remove(new_wb[new_wb.sheetnames[0]])
            for idx, keyword in enumerate(kws, start=1):
                for sheet in selected:
                    original = wb[sheet]
                    ws_name = f"{keyword}_{sheet}"
                    if add_serial:
                        ws_name = f"{idx:02d}_{ws_name}"
                    new_ws = new_wb.create_sheet(title=ws_name)
                    hr = headers[sheet]
                    col_num = columns[sheet]

                    # 复制列宽
                    for c in range(1, original.max_column+1):
                        letter = get_column_letter(c)
                        wd = original.column_dimensions[letter].width
                        if wd: new_ws.column_dimensions[letter].width = wd

                    # 复制“顶端标题行”合并和内容
                    for merge in original.merged_cells.ranges:
                        if merge.min_row<=hr and merge.max_row<=hr:
                            new_ws.merge_cells(
                                start_row=merge.min_row, start_column=merge.min_col,
                                end_row=merge.max_row,   end_column=merge.max_col
                            )
                    for r in range(1, hr+1):
                        for c in range(1, original.max_column+1):
                            oc = original.cell(r,c)
                            nc = new_ws.cell(r,c,value=oc.value)
                            if oc.has_style:
                                nc.font           = copy(oc.font)
                                nc.border         = copy(oc.border)
                                nc.fill           = copy(oc.fill)
                                nc.number_format  = copy(oc.number_format)
                                nc.protection     = copy(oc.protection)
                                nc.alignment      = copy(oc.alignment)
                        rh = original.row_dimensions[r].height
                        if rh: new_ws.row_dimensions[r].height = rh

                    # 复制打印及页面设置
                    new_ws.page_setup = copy(original.page_setup)
                    new_ws.page_margins = copy(original.page_margins)
                    new_ws.print_options = copy(original.print_options)
                    new_ws.print_title_rows = original.print_title_rows
                    new_ws.print_title_cols = original.print_title_cols

                    # 按块复制数据行并保持合并
                    merged_regions = [
                        mr for mr in original.merged_cells.ranges
                        if mr.min_row > hr
                    ]
                    row_to_merge = {}
                    for mr in merged_regions:
                        for r in range(mr.min_row, mr.max_row+1):
                            row_to_merge[r] = mr

                    orig_to_new = {}
                    next_row = hr + 1
                    has_data = False

                    r = hr + 1
                    while r <= original.max_row:
                        mr = row_to_merge.get(r)
                        if mr and r != mr.min_row:
                            r += 1
                            continue

                        if mr:
                            region = list(range(mr.min_row, mr.max_row+1))
                            check_r = mr.min_row
                        else:
                            region = [r]
                            check_r = r

                        val = original.cell(check_r, col_num).value
                        if val and str(keyword) in str(val):
                            for orow in region:
                                for c in range(1, original.max_column+1):
                                    oc = original.cell(orow,c)
                                    nc = new_ws.cell(row=next_row, column=c, value=oc.value)
                                    if oc.has_style:
                                        nc.font           = copy(oc.font)
                                        nc.border         = copy(oc.border)
                                        nc.fill           = copy(oc.fill)
                                        nc.number_format  = copy(oc.number_format)
                                        nc.protection     = copy(oc.protection)
                                        nc.alignment      = copy(oc.alignment)
                                orig_to_new[orow] = next_row
                                rh = original.row_dimensions[orow].height
                                if rh: new_ws.row_dimensions[next_row].height = rh
                                next_row += 1
                            has_data = True

                        r = region[-1] + 1

                    # 重现数据区的合并单元格
                    for mr in merged_regions:
                        if all(rr in orig_to_new for rr in range(mr.min_row, mr.max_row+1)):
                            start_n = orig_to_new[mr.min_row]
                            end_n   = orig_to_new[mr.max_row]
                            new_ws.merge_cells(
                                start_row=start_n, start_column=mr.min_col,
                                end_row=end_n,   end_column=mr.max_col
                            )

                    if not has_data:
                        new_wb.remove(new_ws)
                    else:
                        with lock:
                            split_rows[sheet].update(orig_to_new.keys())

                with lock:
                    completed_keywords += 1
                    progress = (completed_keywords / total_keywords) * 100
                    root.after(0, update_progress, progress)

            if new_wb.sheetnames:
                new_wb.save(os.path.join(result_dir, "combined.xlsx"))

        else:
            def split_keyword(keyword, idx):
                nonlocal completed_keywords
                new_wb = Workbook()
                if new_wb.sheetnames: new_wb.remove(new_wb[new_wb.sheetnames[0]])

                for sheet in selected:
                    original = wb[sheet]
                    new_ws = new_wb.create_sheet(title=sheet)
                    hr = headers[sheet]
                    col_num = columns[sheet]

                    # 复制列宽
                    for c in range(1, original.max_column+1):
                        letter = get_column_letter(c)
                        wd = original.column_dimensions[letter].width
                        if wd: new_ws.column_dimensions[letter].width = wd

                    # 复制“顶端标题行”合并和内容
                    for merge in original.merged_cells.ranges:
                        if merge.min_row<=hr and merge.max_row<=hr:
                            new_ws.merge_cells(
                                start_row=merge.min_row, start_column=merge.min_col,
                                end_row=merge.max_row,   end_column=merge.max_col
                            )
                    for r in range(1, hr+1):
                        for c in range(1, original.max_column+1):
                            oc = original.cell(r,c)
                            nc = new_ws.cell(r,c,value=oc.value)
                            if oc.has_style:
                                nc.font           = copy(oc.font)
                                nc.border         = copy(oc.border)
                                nc.fill           = copy(oc.fill)
                                nc.number_format  = copy(oc.number_format)
                                nc.protection     = copy(oc.protection)
                                nc.alignment      = copy(oc.alignment)
                        rh = original.row_dimensions[r].height
                        if rh: new_ws.row_dimensions[r].height = rh

                    # 复制打印及页面设置
                    new_ws.page_setup = copy(original.page_setup)
                    new_ws.page_margins = copy(original.page_margins)
                    new_ws.print_options = copy(original.print_options)
                    new_ws.print_title_rows = original.print_title_rows
                    new_ws.print_title_cols = original.print_title_cols

                    # 按块复制数据行并保持合并
                    merged_regions = [
                        mr for mr in original.merged_cells.ranges
                        if mr.min_row > hr
                    ]
                    row_to_merge = {}
                    for mr in merged_regions:
                        for r in range(mr.min_row, mr.max_row+1):
                            row_to_merge[r] = mr

                    orig_to_new = {}
                    next_row = hr + 1
                    has_data = False

                    r = hr + 1
                    while r <= original.max_row:
                        mr = row_to_merge.get(r)
                        if mr and r != mr.min_row:
                            r += 1
                            continue

                        if mr:
                            region = list(range(mr.min_row, mr.max_row+1))
                            check_r = mr.min_row
                        else:
                            region = [r]
                            check_r = r

                        val = original.cell(check_r, col_num).value
                        if val and str(keyword) in str(val):
                            for orow in region:
                                for c in range(1, original.max_column+1):
                                    oc = original.cell(orow,c)
                                    nc = new_ws.cell(row=next_row, column=c, value=oc.value)
                                    if oc.has_style:
                                        nc.font           = copy(oc.font)
                                        nc.border         = copy(oc.border)
                                        nc.fill           = copy(oc.fill)
                                        nc.number_format  = copy(oc.number_format)
                                        nc.protection     = copy(oc.protection)
                                        nc.alignment      = copy(oc.alignment)
                                orig_to_new[orow] = next_row
                                rh = original.row_dimensions[orow].height
                                if rh: new_ws.row_dimensions[next_row].height = rh
                                next_row += 1
                            has_data = True

                        r = region[-1] + 1

                    # 重现数据区的合并单元格
                    for mr in merged_regions:
                        if all(rr in orig_to_new for rr in range(mr.min_row, mr.max_row+1)):
                            start_n = orig_to_new[mr.min_row]
                            end_n   = orig_to_new[mr.max_row]
                            new_ws.merge_cells(
                                start_row=start_n, start_column=mr.min_col,
                                end_row=end_n,   end_column=mr.max_col
                            )

                    if not has_data:
                        new_wb.remove(new_ws)
                    else:
                        with lock:
                            split_rows[sheet].update(orig_to_new.keys())

                if new_wb.sheetnames:
                    safe = keyword.replace("/","_").replace("\\","_")
                    if add_serial:
                        safe = f"{idx:02d}_{safe}"
                    new_wb.save(os.path.join(result_dir, f"{safe}.xlsx"))

                with lock:
                    completed_keywords += 1
                    progress = (completed_keywords / total_keywords) * 100
                    root.after(0, update_progress, progress)

            with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
                executor.map(lambda x: split_keyword(x[1], x[0]+1), enumerate(kws))

        # 处理未拆分行
        unsplit_list = []
        for sheet in selected:
            ws = wb[sheet]
            hr = headers[sheet]
            all_rows = set(range(hr+1, ws.max_row+1))
            left = all_rows - split_rows[sheet]
            if left:
                unsplit_list.append((sheet, left))

        if unsplit_list:
            uwb = Workbook()
            if uwb.sheetnames: uwb.remove(uwb[uwb.sheetnames[0]])
            for sheet,left in unsplit_list:
                orig = wb[sheet]
                nws = uwb.create_sheet(title=f"{sheet}_unsplit")
                hr = headers[sheet]
                # 复制列宽 & 顶端标题
                for c in range(1, orig.max_column+1):
                    lt = get_column_letter(c)
                    wd = orig.column_dimensions[lt].width
                    if wd: nws.column_dimensions[lt].width = wd
                for mr in orig.merged_cells.ranges:
                    if mr.min_row<=hr and mr.max_row<=hr:
                        nws.merge_cells(
                            start_row=mr.min_row, start_column=mr.min_col,
                            end_row=mr.max_row,   end_column=mr.max_col
                        )
                for r in range(1,hr+1):
                    for c in range(1, orig.max_column+1):
                        oc = orig.cell(r,c)
                        nc = nws.cell(r,c,value=oc.value)
                        if oc.has_style:
                            nc.font           = copy(oc.font)
                            nc.border         = copy(oc.border)
                            nc.fill           = copy(oc.fill)
                            nc.number_format  = copy(oc.number_format)
                            nc.protection     = copy(oc.protection)
                            nc.alignment      = copy(oc.alignment)
                    rh = orig.row_dimensions[r].height
                    if rh: nws.row_dimensions[r].height = rh

                # 复制打印设置
                nws.page_setup = copy(orig.page_setup)
                nws.page_margins = copy(orig.page_margins)
                nws.print_options = copy(orig.print_options)
                nws.print_title_rows = orig.print_title_rows
                nws.print_title_cols = orig.print_title_cols

                # 复制未拆分数据行及合并
                mrs = [mr for mr in orig.merged_cells.ranges if mr.min_row>hr]
                r2new = {}
                nr = hr + 1
                for r in sorted(left):
                    for c in range(1, orig.max_column+1):
                        oc = orig.cell(r,c)
                        nc = nws.cell(row=nr, column=c, value=oc.value)
                        if oc.has_style:
                            nc.font           = copy(oc.font)
                            nc.border         = copy(oc.border)
                            nc.fill           = copy(oc.fill)
                            nc.number_format  = copy(oc.number_format)
                            nc.protection     = copy(oc.protection)
                            nc.alignment      = copy(oc.alignment)
                    r2new[r] = nr
                    rh = orig.row_dimensions[r].height
                    if rh: nws.row_dimensions[nr].height = rh
                    nr += 1
                for mr in mrs:
                    if all(r in r2new for r in range(mr.min_row, mr.max_row+1)):
                        nws.merge_cells(
                            start_row=r2new[mr.min_row], start_column=mr.min_col,
                            end_row=r2new[mr.max_row],   end_column=mr.max_col
                        )

            uwb.save(os.path.join(result_dir, "unsplit.xlsx"))
            root.after(0, messagebox.showwarning, "警告", "存在未拆分内容,请检查 unsplit.xlsx")
        else:
            root.after(0, messagebox.showinfo, "信息", "所有行均已拆分")

    except Exception as e:
        root.after(0, messagebox.showerror, "错误", f"拆分过程中出错: {e}")
    finally:
        root.after(0, confirm_button.config, {"state": "normal"})
        root.after(0, update_progress, 0)

def update_progress(value):
    progress_var.set(value)
    progress_label.config(text=f"{value:.1f}%")

# 运行 GUI
root.mainloop()
python EXCEL