系列文章目录
文章目录
- 系列文章目录
- 前言
- 一、需求
- 二、使用步骤
- 1.源码
- 总结
前言
一、需求
判断所有指定文件是否重复、是否存在数据造假;
对所有指定的 Excel/CSV 文件的指定列求和、求最大值,并将结果保存到新的 Excel 文件。
二、使用步骤
1.源码
代码如下(示例):
#xlwt只支持xls格式,xlsx格式需要用openpyxl或pandas
# -*- coding: utf-8 -*-
# coding=gbk
import csv
import os
from decimal import Decimal
from decimal import DecimalException
from os.path import dirname

import openpyxl  # read/write Excel 2007 (.xlsx) workbooks
import pandas as pd
import xlrd
import xlwt
from openpyxl import load_workbook
from openpyxl import workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from xlutils.copy import copy

global_var = None
global_var_r = None


def some_function(col1=None, row1=None):
    """Store a column/row pair in the module-level ``global_var`` variables.

    The original body read ``col1`` and ``row1`` as free names that are not
    defined at module level, so every call raised ``NameError``. They are now
    optional parameters; calling with no arguments sets both globals to None.

    Args:
        col1: Value assigned to ``global_var``.
        row1: Value assigned to ``global_var_r``.
    """
    global global_var
    global global_var_r
    global_var = col1
    global_var_r = row1


def get_allfile_msg(file_dir):
    """Return ``(root, dirs, files)`` for the top level of *file_dir*.

    Only the first entry produced by ``os.walk`` is used, so sub-directories
    are listed in ``dirs`` but not descended into. ``files`` keeps only the
    names ending in ``.xls``, ``.xlsx`` or ``.csv``.

    When ``os.walk`` yields nothing (for example the directory does not
    exist), ``(file_dir, [], [])`` is returned instead of ``None`` so that
    callers can always unpack three values.
    """
    top_level = next(os.walk(file_dir), None)
    if top_level is None:
        return file_dir, [], []
    root, dirs, files = top_level
    wanted = [name for name in files if name.endswith(('.xls', '.xlsx', '.csv'))]
    return root, dirs, wanted


def get_allfile_url(root, files):
    """Return ``root + "/" + name`` for every name in *files*."""
    return [root + "/" + file_name for file_name in files]


def get_file_name(path, suffix=('.xlsx', '.xls', '.csv')):
    """Return the full path of every file found recursively under *path*.

    Args:
        path: Directory to walk.
        suffix: Accepted for compatibility; it is NOT applied as a filter, so
            files of every type are returned (callers filter the result
            themselves). The default is a tuple rather than a list to avoid a
            shared mutable default argument.
    """
    return [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(path)
        for file in files
    ]


def convert_csv_to_xlsx(csv_file_path, xlsx_file_path):
    """Read *csv_file_path* with pandas and write it to *xlsx_file_path* without the index."""
    df = pd.read_csv(csv_file_path)
    df.to_excel(xlsx_file_path, index=False)


# Define the pandas-based CSV reader
def read_csv_file(file_path, header=None):
    """Load a comma-separated file into a DataFrame of raw (object) values.

    Args:
        file_path: Path of the CSV file to read.
        header: 0-based row number to use as the column header. When omitted,
            the module-level ``row_header`` set by the main script is used,
            which is how the original one-argument call behaves.

    Returns:
        pandas.DataFrame with every column read as ``object`` dtype.

    Notes:
        The original call passed both ``sep=r"\\s+\\s{0}"`` and
        ``delimiter=','``. ``delimiter`` is an alias for ``sep``; current
        pandas rejects the combination with ``ValueError`` and older releases
        let ``delimiter`` win, so only ``delimiter=','`` is kept.
        Other options previously tried by the author: ``encoding='gbk'`` /
        ``'iso-8859-1'``, ``header=0``, skipping malformed lines.
    """
    if header is None:
        # Set by the main script from the position found by search_str.
        header = row_header
    return pd.read_csv(
        file_path,
        encoding='latin1',
        delimiter=',',
        dtype=object,
        quotechar="'",
        doublequote=True,
        engine="python",
        header=header,
    )


# Define a function that tries to convert a string to a decimal number
def try_convert_to_decimal(value):
    """Return ``Decimal(value)``, or ``None`` when *value* cannot be converted.

    The original ``except`` clause named ``DecimalException`` without
    importing it, so a failed conversion raised ``NameError`` instead of
    returning ``None``. ``ValueError`` is also handled because ``Decimal``
    raises it for malformed tuple/list arguments.
    """
    try:
        return Decimal(value)
    except (TypeError, ValueError, DecimalException):
        return None


# Define the CSV column sum
def sum_csv_column(filename, column_index):
    """Sum the integer values of one column of a comma-separated file.

    The first line is treated as a header and skipped. Rows that are too
    short to contain *column_index* are ignored.

    The original used ``csv.DictReader`` and then called ``next()`` on it,
    which discarded the first data row (the header is already consumed by
    ``DictReader``) and then indexed dict rows by integer. A plain
    ``csv.reader`` is used instead.

    Args:
        filename: Path of the CSV file.
        column_index: 0-based index of the column to sum.

    Returns:
        int: Sum of the column (0 for an empty or header-only file).

    Raises:
        ValueError: If a cell in the column is not an integer literal.
    """
    total = 0
    with open(filename, mode='r', newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=",")
        next(csvreader, None)  # skip the header row; tolerate an empty file
        for row in csvreader:
            if len(row) > column_index:
                total += int(row[column_index])
    return total


def calculate_sum_in_csv(file_path, row_start, row_end, col_start, col_end):
    """Return the grand total of a rectangular slice of a UTF-8 CSV file.

    The slice is ``df.iloc[row_start:row_end, col_start:col_end]`` (0-based,
    end-exclusive) of the DataFrame produced by ``pd.read_csv``.
    """
    df = pd.read_csv(file_path, encoding='utf-8')
    subset = df.iloc[row_start:row_end, col_start:col_end]
    return subset.sum().sum()


def get_top_nine_dirs(file_path):
    """Return the 9th component of *file_path* split on ``os.path.sep``.

    The original returned ``os.path.sep.join(parts[8])``, which joins the
    *characters* of that component with separators. The component itself is
    returned now. Paths with fewer than nine components yield ``None``, in
    line with ``get_nth_directory_name``.
    """
    parts = file_path.split(os.path.sep)
    if len(parts) < 9:
        return None
    return parts[8]


def get_nth_directory_name(file_path, n):
    """Return the *n*-th (1-based) component of *file_path* split on ``os.path.sep``.

    Returns ``None`` when the path has fewer than *n* components, or when
    *n* is smaller than 1 (which previously wrapped around to the end of the
    path through negative indexing).
    """
    directories = file_path.split(os.path.sep)
    if n < 1 or len(directories) < n:
        return None
    return directories[n - 1]


def is_number(string):
    """Return True if ``Decimal(string)`` succeeds, otherwise False.

    The original caught ``DecimalError``, a name that does not exist, so any
    non-numeric input raised ``NameError`` instead of returning False.
    """
    try:
        Decimal(string)
    except (TypeError, ValueError, DecimalException):
        return False
    return True


# Find a value and report its position
def search_str(filename, search_char):
    """Locate *search_char* in a CSV file and publish its position.

    The file is scanned row by row; the first row containing a cell equal to
    *search_char* ends the scan. If that row holds several matching cells the
    last one is used. On success the module-level variables are updated:

    * ``global_var``   = 1-based column of the match
    * ``global_var_r`` = 1-based row of the match minus 2

    When nothing is found, or the file cannot be read, both globals are left
    untouched.

    Changes from the original: *search_char* is honoured (a hard-coded "1S"
    was used before), read errors are reported instead of being silently
    swallowed by a bare ``except``, and the printed row total is the real
    total rather than the number of rows after the match.

    Args:
        filename: Path of the CSV file to scan.
        search_char: Exact cell content to look for.

    Returns:
        tuple[int, int] | None: ``(row, column)``, both 1-based, or ``None``.
    """
    global global_var
    global global_var_r

    try:
        # newline='' is what the csv module expects for files it parses.
        with open(filename, 'r', newline='') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=",")
            for row_index, row in enumerate(csvreader, start=1):
                matches = [
                    (row_index, col_index)
                    for col_index, cell in enumerate(row, start=1)
                    if cell == search_char
                ]
                if not matches:
                    continue
                # Rows read so far plus whatever is left in the reader.
                num_rows = row_index + sum(1 for _ in csvreader)
                for row1, col1 in matches:
                    print(f"Cell data:{search_char},Row index:{row1},Column index:{col1}")
                    print('已查到该值', search_char)
                    print(f'"{search_char}"在第{row1}行,第{col1}列被找到')
                print(f'总行数:"{num_rows}"')
                row1, col1 = matches[-1]
                global_var = col1
                global_var_r = row1 - 2
                return row1, col1
    except (OSError, UnicodeError, csv.Error) as exc:
        # Best effort: report the problem and let the caller skip this file.
        print(f"search_str: cannot scan {filename}: {exc}")
    return None


def main():
    """Summarise the marker columns of every ``Data0_mtf.csv`` under the share.

    For each first-level sub-directory of the data directory, every file whose
    path contains ``Data0_mtf.csv`` is scanned for the cell ``"1S"``. The
    column holding that cell and the column to its right are converted to
    numbers, and their sum and maximum are collected. All results are written
    to ``results.xlsx`` in the current working directory.
    """
    # read_csv_file takes its header row from this module-level name.
    global row_header

    file_dir = r"\\10.99.10.141\临时使用共享盘\品保勿删\跌落\2024年"
    # Tolerate a helper that returns None for an unreadable directory.
    root, dirs, _files = get_allfile_msg(file_dir) or (file_dir, [], [])
    print(dirs)
    print(len(dirs))

    # 1-based positions of the path components exported as "name" and "part".
    # NOTE(review): these presumably depend on the depth of file_dir - confirm
    # before pointing the script at another location.
    name_level = 10
    part_level = 11

    datas = []
    datas_add = []
    data_paths = []
    data_paths_part = []
    data_paths_name = []
    total_files = 0

    for dir_name in dirs:
        path = os.path.join(root, dir_name)
        print(path)
        print(len(dir_name))
        fl_lst = [item for item in get_file_name(path) if "Data0_mtf.csv" in item]
        print(len(fl_lst))
        total_files += len(fl_lst)

        for xl in fl_lst:
            print(xl)
            # One unreadable file must not abort the remaining files.
            try:
                location = search_str(xl, "1S")
                if location is None:
                    # Without a fresh position the previous file's column
                    # would be reused, so skip the file instead.
                    print(f'"1S" not found in {xl}; file skipped')
                    continue
                marker_row, marker_col = location
                column_index = marker_col - 1   # 0-based column of the marker
                column_index_add = marker_col   # column right of the marker
                row_header = marker_row - 2
                df = read_csv_file(xl)
                primary = pd.to_numeric(df.iloc[:, column_index], errors='coerce')
                secondary = pd.to_numeric(df.iloc[:, column_index_add], errors='coerce')
            except Exception as e:  # top-level boundary: report and go on
                print(f"{xl}: {e}")
                continue

            datas.append({'sum': primary.sum(), 'max': primary.max()})
            datas_add.append({'sum_add': secondary.sum(), 'max_add': secondary.max()})
            data_paths.append({'path': xl})
            data_paths_part.append({'part': get_nth_directory_name(xl, part_level)})
            data_paths_name.append({'name': get_nth_directory_name(xl, name_level)})

    if datas:
        with pd.ExcelWriter('results.xlsx') as writer:
            pd.DataFrame(datas)[['sum', 'max']].to_excel(
                writer, startrow=0, startcol=4, index=False)
            pd.DataFrame(datas_add)[['sum_add', 'max_add']].to_excel(
                writer, startrow=0, startcol=7, index=False)
            pd.DataFrame(data_paths)[['path']].to_excel(
                writer, startrow=0, startcol=3, index=False)
            pd.DataFrame(data_paths_part)[['part']].to_excel(
                writer, startrow=0, startcol=2, index=False)
            pd.DataFrame(data_paths_name)[['name']].to_excel(
                writer, startrow=0, startcol=1, index=False)

    # Actual number of matching files, summed over all directories.
    print(f"共有{total_files}个文件")


if __name__ == '__main__':
    main()
总结
分享
从道理上讲,烦恼都是由思维和想法产生的;冥想的状态可以让我们看清这些思维和想法,并且放下它们,回归平静。