import re
import chardet
def clean_file(file_path, output_path, replace_char=','):
    """Keep only ASCII letters, digits and CJK ideographs from a text file.

    The input is read as raw bytes, its encoding is guessed with ``chardet``,
    and every character outside ``a-z``, ``A-Z``, ``0-9`` and the range
    U+4E00..U+9FFF is replaced by ``replace_char``. The result is written to
    ``output_path`` encoded as UTF-8.

    Args:
        file_path: Path of the file to read.
        output_path: Path of the file to write the cleaned text to.
        replace_char: Text substituted for each disallowed character. A falsy
            value (``''`` or ``None``) drops disallowed characters instead.

    Returns:
        None. If the content cannot be decoded, a message is printed and
        nothing is written.
    """
    with open(file_path, 'rb') as file:
        content = file.read()

    # Detect the file encoding. chardet may report None when it cannot make
    # a guess (for example on empty input); fall back to UTF-8 so that
    # bytes.decode() is never handed None, which would raise TypeError.
    detected_encoding = chardet.detect(content)['encoding'] or 'utf-8'

    # Decode the file content.
    try:
        text = content.decode(detected_encoding)
    except (UnicodeDecodeError, LookupError):
        # LookupError covers a detected codec name that Python does not know.
        print("Unable to decode the file with the detected encoding.")
        return

    replacement = replace_char or ''
    # A callable replacement keeps re.sub from interpreting backslashes in
    # the replacement text as template escapes.
    cleaned_text = re.sub(
        r'[^a-zA-Z0-9\u4e00-\u9fff]',
        lambda _match: replacement,
        text,
    )

    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(cleaned_text)
# Example usage: clean 'input.txt' and write the result to 'cleaned_file.txt'.
file_path, output_path = 'input.txt', 'cleaned_file.txt'
clean_file(file_path=file_path, output_path=output_path)