txt重新编码 – 探索加拿大



import re
import chardet

def clean_file(file_path, output_path, replace_char=','):
    """Rewrite a text file as UTF-8, keeping only letters, digits and CJK ideographs.

    The input is read as raw bytes and its encoding is guessed with
    ``chardet``. Every character that is not an ASCII letter, an ASCII digit
    or in the range U+4E00-U+9FFF is replaced by ``replace_char``; the result
    is written to ``output_path`` encoded as UTF-8.

    Args:
        file_path: Path of the file to read.
        output_path: Path of the UTF-8 file to write.
        replace_char: Text substituted for each rejected character. A falsy
            value (``''`` or ``None``) drops rejected characters instead.

    Returns:
        None. If the bytes cannot be decoded, a message is printed and the
        function returns without writing the output file.
    """
    with open(file_path, 'rb') as file:
        content = file.read()

    # chardet reports an encoding of None when it cannot make a guess (for
    # example on an empty file); fall back to UTF-8 instead of handing None
    # to bytes.decode(), which would raise TypeError.
    detected_encoding = chardet.detect(content)['encoding'] or 'utf-8'

    try:
        text = content.decode(detected_encoding)
    except (UnicodeDecodeError, LookupError):
        # LookupError covers a detected codec name this interpreter lacks.
        print("Unable to decode the file with the detected encoding.")
        return

    replacement = replace_char or ''
    # A callable replacement keeps backslashes in replace_char literal
    # (a plain string would be parsed as a regex replacement template).
    cleaned_text = re.sub(
        r'[^a-zA-Z0-9\u4e00-\u9fff]',
        lambda _match: replacement,
        text,
    )

    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(cleaned_text)

# Example usage: these statements run whenever the module is executed or
# imported. They read 'input.txt' (relative to the current working directory)
# and write the cleaned text to 'cleaned_file.txt'.
file_path = 'input.txt'
output_path = 'cleaned_file.txt'
clean_file(file_path, output_path)

相关文章

带UI界面的PDF内容提取代码

天池tts

TTS网页版 未验证

TTS网页版未验证