问题1:客户反馈,数据量大的时候,用筛选指令会比较慢。例如下面这两个帖子:
问题2:希望支持复杂的筛选条件。如果是用《筛选》指令筛选多列,要用到多个筛选。例如:
如果用《df表格过滤(多条件)》指令,各条件直接只能是”并且“或者是”或“的关系,如果是(A&B)|C,是不支持的。
对于大规模、复杂的数据处理任务,可以用pandas。
import pandas as pd
def filter_dataframe_complex(list1, conditions,logical_expression):
df = pd.DataFrame(list1[1:],columns=list1[0])
# 生成每个条件的布尔Series
masks = []
for condition in conditions:
column = condition['column']
cond_type = condition['condition']
keyword = condition['keyword']
if keyword==None:
keyword = keyword
elif isinstance(keyword, (int, float)) == False:
keyword = f"'{keyword}'"
if cond_type == 'contains':
mask = f"df['{column}'].str.contains({keyword})"
elif cond_type == '=':
if keyword == None:
mask = f"df['{column}'].isnull()"
else:
mask = f"df['{column}'] == {keyword}"
elif cond_type == '!=':
if keyword == None:
mask = f"df['{column}'].notnull()"
else:
mask = f"df['{column}'] != {keyword}"
elif cond_type == '>':
mask = f"df['{column}'] > {keyword}"
elif cond_type == '<':
mask = f"df['{column}'] < {keyword}"
elif cond_type == '>=':
mask = f"df['{column}'] >={keyword}"
elif cond_type == '<=':
mask = f"df['{column}'] <= {keyword}"
elif cond_type == 'not_contains':
mask = f"~df['{column}'].str.contains({keyword})"
elif cond_type == 'startswith':
mask = f"df['{column}'].str.startswith({keyword})"
elif cond_type == 'endswith':
mask = f"df['{column}'].str.endswith({keyword})"
elif cond_type == 'not_startswith':
mask = f"~df['{column}'].str.startswith({keyword})"
elif cond_type == 'not_endswith':
mask = f"~df['{column}'].str.endswith({keyword})"
else:
raise ValueError(f"Unsupported condition type: {cond_type}")
masks.append(mask)
# print(masks)
for i,item in enumerate(masks):
logical_expression=logical_expression.replace("条件"+str(i+1),item)
logical_expression = eval(logical_expression)
# print(logical_expression)
# 应用筛选条件
filtered_df = df.iloc[df.index[logical_expression]]
# 将结果转换为列表
result_list = filtered_df.values.tolist()
return result_list
list1=[['country','population','city','region','year'],
['中国',2154,'北京',None,2001],
['美国',840,'纽约','北美',2005],
['日本',1266,'东京','东亚',2003],
['英国',678,'伦敦','欧洲',2010],
['中国',2154,'北京',None,2001]]
#condition示例
#'=','!=','>','<','>=','<=','contains','not_contains','startswith','endswith','not_startswith','not_endswith'
#logical_expression示例
#'(条件1)'
#'(条件1) & (条件2)'
#'((条件1) & (条件2))|(条件3)'
#'((条件1) & (条件2))|((条件3)&(条件4))'
# 示例条件
conditions = [
{'column': 'region', 'condition': 'startswith', 'keyword': "北"},
{'column': 'year', 'condition': '>=', 'keyword': 2000},
{'column': 'city', 'condition': 'endswith', 'keyword': '京'}
]
# 示例逻辑组
logical_expression = '((条件1) & (条件2))|(条件3)'
# 调用函数
result = filter_dataframe_complex(list1, conditions, logical_expression)
# 打印结果
print(result)
#结果:[['中国', 2154, '北京', None, 2001], ['美国', 840, '纽约', '北美', 2005], ['日本', 1266, '东京', '东亚', 2003], ['中国', 2154, '北京', None, 2001]]