This commit is contained in:
睿 安
2026-01-21 16:48:36 +08:00
commit abba5cb273
246 changed files with 57473 additions and 0 deletions

View File

View File

@@ -0,0 +1,176 @@
import csv
import html
import os
import shutil
import sys
import filecmp
from PyQt5.QtCore import pyqtSignal, QThread
from app.config import OUTPUT_DIR
from app.person import Me, Contact
os.makedirs(os.path.join(OUTPUT_DIR, '聊天记录'), exist_ok=True)
def set_global_font(doc, font_name):
# 创建一个新样式
style = doc.styles['Normal']
# 设置字体名称
style.font.name = font_name
# 遍历文档中的所有段落,将样式应用到每个段落
for paragraph in doc.paragraphs:
for run in paragraph.runs:
run.font.name = font_name
def makedirs(path):
os.makedirs(path, exist_ok=True)
os.makedirs(os.path.join(path, 'image'), exist_ok=True)
os.makedirs(os.path.join(path, 'emoji'), exist_ok=True)
os.makedirs(os.path.join(path, 'video'), exist_ok=True)
os.makedirs(os.path.join(path, 'voice'), exist_ok=True)
os.makedirs(os.path.join(path, 'file'), exist_ok=True)
os.makedirs(os.path.join(path, 'avatar'), exist_ok=True)
os.makedirs(os.path.join(path, 'music'), exist_ok=True)
os.makedirs(os.path.join(path, 'icon'), exist_ok=True)
resource_dir = os.path.join('app', 'resources', 'data', 'icons')
if not os.path.exists(resource_dir):
# 获取打包后的资源目录
resource_dir = getattr(sys, '_MEIPASS', os.path.abspath(os.path.dirname(__file__)))
# 构建 FFmpeg 可执行文件的路径
resource_dir = os.path.join(resource_dir, 'app', 'resources', 'data', 'icons')
target_folder = os.path.join(path, 'icon')
# 拷贝一些必备的图标
for root, dirs, files in os.walk(resource_dir):
relative_path = os.path.relpath(root, resource_dir)
target_path = os.path.join(target_folder, relative_path)
# 遍历文件夹中的文件
for file in files:
source_file_path = os.path.join(root, file)
target_file_path = os.path.join(target_path, file)
if not os.path.exists(target_file_path):
shutil.copy(source_file_path, target_file_path)
else:
# 比较文件内容
if not filecmp.cmp(source_file_path, target_file_path, shallow=False):
# 文件内容不一致,进行覆盖拷贝
shutil.copy(source_file_path, target_file_path)
def escape_js_and_html(input_str):
if not input_str:
return ''
# 转义HTML特殊字符
html_escaped = html.escape(input_str, quote=False)
# 手动处理JavaScript转义字符
js_escaped = (
html_escaped
.replace("\\", "\\\\")
.replace("'", r"\'")
.replace('"', r'\"')
.replace("\n", r'\n')
.replace("\r", r'\r')
.replace("\t", r'\t')
)
return js_escaped
class ExporterBase(QThread):
progressSignal = pyqtSignal(int)
rangeSignal = pyqtSignal(int)
okSignal = pyqtSignal(int)
i = 1
CSV = 0
DOCX = 1
HTML = 2
CSV_ALL = 3
CONTACT_CSV = 4
TXT = 5
def __init__(self, contact, type_=DOCX, message_types={}, time_range=None, messages=None,index=0, parent=None):
super().__init__(parent)
self.message_types = message_types # 导出的消息类型
self.contact: Contact = contact # 联系人
self.output_type = type_ # 导出文件类型
self.total_num = 1 # 总的消息数量
self.num = 0 # 当前处理的消息数量
self.index = index #
self.last_timestamp = 0
self.time_range = time_range
self.messages = messages
self.origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
makedirs(self.origin_path)
def run(self):
self.export()
def export(self):
raise NotImplementedError("export method must be implemented in subclasses")
def cancel(self):
self.requestInterruption()
def is_5_min(self, timestamp) -> bool:
if abs(timestamp - self.last_timestamp) > 300:
self.last_timestamp = timestamp
return True
return False
def get_avatar_path(self, is_send, message, is_absolute_path=False) -> str:
if is_absolute_path:
if self.contact.is_chatroom:
avatar = message[13].avatar_path
else:
avatar = Me().avatar_path if is_send else self.contact.avatar_path
else:
if self.contact.is_chatroom:
avatar = message[13].smallHeadImgUrl
else:
avatar = Me().smallHeadImgUrl if is_send else self.contact.smallHeadImgUrl
return avatar
def get_display_name(self, is_send, message) -> str:
if self.contact.is_chatroom:
if is_send:
display_name = Me().name
else:
display_name = message[13].remark
else:
display_name = Me().name if is_send else self.contact.remark
return escape_js_and_html(display_name)
def text(self, doc, message):
return
def image(self, doc, message):
return
def audio(self, doc, message):
return
def emoji(self, doc, message):
return
def file(self, doc, message):
return
def refermsg(self, doc, message):
return
def system_msg(self, doc, message):
return
def video(self, doc, message):
return
def music_share(self, doc, message):
return
def share_card(self, doc, message):
return

View File

@@ -0,0 +1,96 @@
import os
import re
from app.DataBase import msg_db
from app.util.compress_content import parser_reply, share_card
from app.util.exporter.exporter import ExporterBase
def remove_privacy_info(text):
# 正则表达式模式
patterns = {
'phone': r'\b(\+?86[-\s]?)?1[3-9]\d{9}\b', # 手机号
'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', # 邮箱
'id_card': r'\b\d{15}|\d{18}|\d{17}X\b', # 身份证号
'password': r'\b(?:password|pwd|pass|psw)[\s=:]*\S+\b', # 密码
'account': r'\b(?:account|username|user|acct)[\s=:]*\S+\b' # 账号
}
for key, pattern in patterns.items():
text = re.sub(pattern, f'[{key} xxx]', text)
return text
class AiTxtExporter(ExporterBase):
last_is_send = -1
def title(self, message):
str_time = message[8]
is_send = message[4]
display_name = ''
if is_send != self.last_is_send:
display_name = '\n' + self.get_display_name(is_send, message) + ':'
self.last_is_send = is_send
return display_name
def text(self, doc, message):
str_content = remove_privacy_info(message[7])
doc.write(
f'''{self.title(message)}{str_content} '''
)
def image(self, doc, message):
doc.write(
f'''{self.title(message)}[图片]'''
)
def audio(self, doc, message):
doc.write(
f'''{self.title(message)}[语音]'''
)
def emoji(self, doc, message):
doc.write(
f'''{self.title(message)}[表情包]'''
)
def file(self, doc, message):
doc.write(
f'''{self.title(message)}[文件]'''
)
def system_msg(self, doc, message):
str_content = message[7]
str_time = message[8]
str_content = str_content.replace('<![CDATA[', "").replace(
' <a href="weixin://revoke_edit_click">重新编辑</a>]]>', "")
doc.write(
f'''{str_time} {str_content}'''
)
def video(self, doc, message):
is_send = message[4]
doc.write(
f'''{self.title(message)}[视频]'''
)
def export(self):
# 实现导出为txt的逻辑
print(f"【开始导出 TXT {self.contact.remark}")
origin_path = self.origin_path
os.makedirs(origin_path, exist_ok=True)
filename = os.path.join(origin_path, self.contact.remark + '_chat.txt')
messages = msg_db.get_messages_group_by_day(self.contact.wxid, time_range=self.time_range)
total_steps = len(messages)
with open(filename, mode='w', newline='', encoding='utf-8') as f:
for date, messages in messages.items():
f.write(f"\n\n{'*' * 20}{date}{'*' * 20}\n")
for index, message in enumerate(messages):
type_ = message[2]
sub_type = message[3]
self.progressSignal.emit(int((index + 1) / total_steps * 100))
if type_ == 1 and self.message_types.get(type_):
self.text(f, message)
print(f"【完成导出 TXT {self.contact.remark}")
self.okSignal.emit(1)

View File

@@ -0,0 +1,40 @@
import csv
import os
from app.DataBase import msg_db
from app.person import Me
from app.util.exporter.exporter import ExporterBase
from app.config import OUTPUT_DIR
class CSVExporter(ExporterBase):
def to_csv(self):
print(f"【开始导出 CSV {self.contact.remark}")
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
os.makedirs(origin_path, exist_ok=True)
filename = os.path.join(origin_path,f"{self.contact.remark}_utf8.csv")
columns = ['localId', 'TalkerId', 'Type', 'SubType',
'IsSender', 'CreateTime', 'Status', 'StrContent',
'StrTime', 'Remark', 'NickName', 'Sender']
messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range)
# 写入CSV文件
with open(filename, mode='w', newline='', encoding='utf-8-sig') as file:
writer = csv.writer(file)
writer.writerow(columns)
# 写入数据
# writer.writerows(messages)
for msg in messages:
if self.contact.is_chatroom:
other_data = [msg[13].remark, msg[13].nickName, msg[13].wxid]
else:
is_send = msg[4]
Remark = Me().remark if is_send else self.contact.remark
nickname = Me().nickName if is_send else self.contact.nickName
wxid = Me().wxid if is_send else self.contact.wxid
other_data = [Remark,nickname,wxid]
writer.writerow([*msg[:9], *other_data])
print(f"【完成导出 CSV {self.contact.remark}")
self.okSignal.emit(1)
def run(self):
self.to_csv()

View File

@@ -0,0 +1,380 @@
import os
import shutil
import time
from re import findall
import docx
from docx import shared
from docx.enum.table import WD_ALIGN_VERTICAL
from docx.enum.text import WD_COLOR_INDEX, WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
from docxcompose.composer import Composer
from app.DataBase import msg_db, hard_link_db
from app.util.exporter.exporter import ExporterBase, escape_js_and_html
from app.config import OUTPUT_DIR
from app.log import logger
from app.person import Me
from app.util.compress_content import parser_reply, share_card, music_share
from app.util.image import get_image_abs_path
from app.util.music import get_music_path
# 要删除的编码字符
encoded_chars = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'
# 创建一个字典,将要删除的字符映射为 None
char_mapping = {char: None for char in encoded_chars}
def filter_control_characters(input_string):
"""
过滤掉不可打印字符
@param input_string:
@return:
"""
# 过滤掉非可打印字符
filtered_string = input_string.translate(char_mapping)
return filtered_string
class DocxExporter(ExporterBase):
def text(self, doc, message):
type_ = message[2]
str_content = message[7]
str_time = message[8]
is_send = message[4]
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
display_name = self.get_display_name(is_send, message)
avatar = self.get_avatar_path(is_send, message, True)
content_cell = self.create_table(doc, is_send, avatar)
try:
content_cell.paragraphs[0].add_run(str_content)
except ValueError:
try:
str_content = filter_control_characters(str_content)
content_cell.paragraphs[0].add_run(str_content)
except ValueError:
logger.error(f'非法字符:{str_content}')
content_cell.paragraphs[0].add_run('非法字符')
content_cell.paragraphs[0].font_size = shared.Inches(0.5)
# doc.add_picture(avatar)
if is_send:
p = content_cell.paragraphs[0]
p.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
doc.add_paragraph()
def image(self, doc, message):
str_content = message[7]
is_send = message[4]
BytesExtra = message[10]
avatar = self.get_avatar_path(is_send, message, True)
content = self.create_table(doc, is_send, avatar)
run = content.paragraphs[0].add_run()
str_content = escape_js_and_html(str_content)
image_path = hard_link_db.get_image(str_content, BytesExtra, thumb=True)
base_path = os.path.join(OUTPUT_DIR, '聊天记录', self.contact.remark, 'image')
if not os.path.exists(os.path.join(Me().wx_dir, image_path)):
image_thumb_path = hard_link_db.get_image(str_content, BytesExtra, thumb=False)
if not os.path.exists(os.path.join(Me().wx_dir, image_thumb_path)):
return
image_path = image_thumb_path
image_path = get_image_abs_path(image_path, base_path=base_path)
try:
run.add_picture(image_path, height=shared.Inches(2))
doc.add_paragraph()
except Exception:
print("Error!image")
def audio(self, doc, message):
str_content = message[7]
str_time = message[8]
is_send = message[4]
msgSvrId = message[9]
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
avatar = self.get_avatar_path(is_send, message, True)
content_cell = self.create_table(doc, is_send, avatar)
content_cell.paragraphs[0].add_run('【语音】')
content_cell.paragraphs[0].font_size = shared.Inches(0.5)
if is_send:
p = content_cell.paragraphs[0]
p.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
doc.add_paragraph()
def emoji(self, doc, message):
str_content = message[7]
str_time = message[8]
is_send = message[4]
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
avatar = self.get_avatar_path(is_send, message, True)
content_cell = self.create_table(doc, is_send, avatar)
content_cell.paragraphs[0].add_run('【表情包】')
content_cell.paragraphs[0].font_size = shared.Inches(0.5)
if is_send:
p = content_cell.paragraphs[0]
p.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
doc.add_paragraph()
def file(self, doc, message):
bytesExtra = message[10]
str_time = message[8]
is_send = message[4]
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
avatar = self.get_avatar_path(is_send, message, True)
content_cell = self.create_table(doc, is_send, avatar)
content_cell.paragraphs[0].add_run('【文件】')
content_cell.paragraphs[0].font_size = shared.Inches(0.5)
if is_send:
p = content_cell.paragraphs[0]
p.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
doc.add_paragraph()
def refermsg(self, doc, message):
"""
处理回复消息
@param doc:
@param message:
@return:
"""
str_time = message[8]
is_send = message[4]
content = parser_reply(message[11])
refer_msg = content.get('refer')
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
avatar = self.get_avatar_path(is_send, message, True)
content_cell = self.create_table(doc, is_send, avatar)
content_cell.paragraphs[0].add_run(content.get('title'))
content_cell.paragraphs[0].font_size = shared.Inches(0.5)
reply_p = content_cell.add_paragraph()
reply_content = f"{refer_msg.get('displayname')}:{refer_msg.get('content')}" if refer_msg else '未知引用'
run = content_cell.paragraphs[1].add_run(reply_content)
'''设置被回复内容格式'''
run.font.color.rgb = shared.RGBColor(121, 121, 121)
run.font_size = shared.Inches(0.3)
run.font.highlight_color = WD_COLOR_INDEX.GRAY_25
if is_send:
p = content_cell.paragraphs[0]
p.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
reply_p.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
doc.add_paragraph()
def system_msg(self, doc, message):
str_content = message[7]
is_send = message[4]
str_time = message[8]
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
str_content = str_content.replace('<![CDATA[', "").replace(
' <a href="weixin://revoke_edit_click">重新编辑</a>]]>', "")
res = findall('(</{0,1}(img|revo|_wc_cus|a).*?>)', str_content)
for xmlstr, b in res:
str_content = str_content.replace(xmlstr, "")
doc.add_paragraph(str_content).alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
def video(self, doc, message):
type_ = message[2]
str_content = message[7]
str_time = message[8]
is_send = message[4]
BytesExtra = message[10]
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
avatar = self.get_avatar_path(is_send, message, True)
content_cell = self.create_table(doc, is_send, avatar)
content_cell.paragraphs[0].add_run('【视频】')
content_cell.paragraphs[0].font_size = shared.Inches(0.5)
if is_send:
p = content_cell.paragraphs[0]
p.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
doc.add_paragraph()
def create_table(self, doc, is_send, avatar_path):
'''
#! 创建一个1*2表格
#! isSend = 1 (0,0)存聊天内容,(0,1)存头像
#! isSend = 0 (0,0)存头像,(0,1)存聊天内容
#! 返回聊天内容的坐标
'''
table = doc.add_table(rows=1, cols=2, style='Normal Table')
table.cell(0, 1).height = shared.Inches(0.5)
table.cell(0, 0).height = shared.Inches(0.5)
if is_send:
'''表格右对齐'''
table.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
avatar = table.cell(0, 1).paragraphs[0].add_run()
'''插入头像,设置头像宽度'''
avatar.add_picture(avatar_path, width=shared.Inches(0.5))
'''设置单元格宽度跟头像一致'''
table.cell(0, 1).width = shared.Inches(0.5)
content_cell = table.cell(0, 0)
'''聊天内容右对齐'''
content_cell.paragraphs[0].paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
else:
avatar = table.cell(0, 0).paragraphs[0].add_run()
avatar.add_picture(avatar_path, width=shared.Inches(0.5))
'''设置单元格宽度'''
table.cell(0, 0).width = shared.Inches(0.5)
content_cell = table.cell(0, 1)
'''聊天内容垂直居中对齐'''
content_cell.vertical_alignment = WD_ALIGN_VERTICAL.CENTER
return content_cell
def music_share(self, doc, message):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
is_send = message[4]
timestamp = message[5]
content = music_share(message[11])
music_path = ''
if content.get('audio_url') != '':
music_path = get_music_path(content.get('audio_url'), content.get('title'),
output_path=origin_path + '/music')
if music_path != '':
music_path = f'./music/{os.path.basename(music_path)}'
music_path = music_path.replace('\\', '/')
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
def share_card(self, doc, message):
origin_path = f"{os.getcwd()}/data/聊天记录/{self.contact.remark}"
is_send = message[4]
timestamp = message[5]
bytesExtra = message[10]
compress_content_ = message[11]
card_data = share_card(bytesExtra, compress_content_)
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
thumbnail = ''
if card_data.get('thumbnail'):
thumbnail = os.path.join(Me().wx_dir, card_data.get('thumbnail'))
if os.path.exists(thumbnail):
shutil.copy(thumbnail, os.path.join(origin_path, 'image', os.path.basename(thumbnail)))
thumbnail = './image/' + os.path.basename(thumbnail)
else:
thumbnail = ''
app_logo = ''
if card_data.get('app_logo'):
app_logo = os.path.join(Me().wx_dir, card_data.get('app_logo'))
if os.path.exists(app_logo):
shutil.copy(app_logo, os.path.join(origin_path, 'image', os.path.basename(app_logo)))
app_logo = './image/' + os.path.basename(app_logo)
else:
app_logo = ''
def merge_docx(self, conRemark, n):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录')
all_file_path = []
for i in range(n):
file_name = f"{conRemark}{i}.docx"
all_file_path.append(origin_path + '/' + file_name)
filename = f"{conRemark}.docx"
# print(all_file_path)
doc = docx.Document()
doc.save(origin_path + '/' + filename)
master = docx.Document(origin_path + '/' + filename)
middle_new_docx = Composer(master)
num = 0
for word in all_file_path:
word_document = docx.Document(word)
word_document.add_page_break()
if num != 0:
middle_new_docx.append(word_document)
num = num + 1
os.remove(word)
middle_new_docx.save(origin_path + '/' + filename)
def export(self):
print(f"【开始导出 DOCX {self.contact.remark}")
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range)
Me().save_avatar(os.path.join(origin_path, 'avatar', f'{Me().wxid}.png'))
if self.contact.is_chatroom:
for message in messages:
if message[4]: # is_send
continue
try:
chatroom_avatar_path =os.path.join(origin_path, 'avatar', f'{message[13].wxid}.png')
message[13].save_avatar(chatroom_avatar_path)
except:
print(message)
pass
else:
self.contact.save_avatar(os.path.join(origin_path, 'avatar', f'{self.contact.wxid}.png'))
self.rangeSignal.emit(len(messages))
def newdoc():
nonlocal n, doc
doc = docx.Document()
doc.styles["Normal"].font.name = "Cambria"
doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
n += 1
doc = None
n = 0
index = 0
newdoc()
for index, message in enumerate(messages):
if index % 200 == 0 and index:
filename = os.path.join(origin_path, f"{self.contact.remark}_{n}.docx")
doc.save(filename)
self.okSignal.emit(n)
newdoc()
type_ = message[2]
sub_type = message[3]
timestamp = message[5]
self.progressSignal.emit(1)
if self.is_5_min(timestamp):
str_time = message[8]
doc.add_paragraph(str_time).alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
if type_ == 1 and self.message_types.get(type_):
self.text(doc, message)
elif type_ == 3 and self.message_types.get(type_):
self.image(doc, message)
elif type_ == 34 and self.message_types.get(type_):
self.audio(doc, message)
elif type_ == 43 and self.message_types.get(type_):
self.video(doc, message)
elif type_ == 47 and self.message_types.get(type_):
self.emoji(doc, message)
elif type_ == 10000 and self.message_types.get(type_):
self.system_msg(doc, message)
elif type_ == 49 and sub_type == 57 and self.message_types.get(1):
self.refermsg(doc, message)
elif type_ == 49 and sub_type == 6 and self.message_types.get(4906):
self.file(doc, message)
if index % 25 == 0:
print(f"【导出 DOCX {self.contact.remark}{index}/{len(messages)}")
if index % 25:
print(f"【导出 DOCX {self.contact.remark}{index + 1}/{len(messages)}")
filename = os.path.join(origin_path, f"{self.contact.remark}_{n}.docx")
try:
# document.save(filename)
doc.save(filename)
except PermissionError:
filename = filename[:-5] + f'{time.time()}' + '.docx'
# document.save(filename)
doc.save(filename)
self.okSignal.emit(n)
print(f"【完成导出 DOCX {self.contact.remark}")
self.okSignal.emit(10086)

View File

@@ -0,0 +1,523 @@
import os
import shutil
import sys
import traceback
from re import findall
from PyQt5.QtCore import pyqtSignal, QThread
from app.DataBase import msg_db, hard_link_db, media_msg_db
from app.util.exporter.exporter import ExporterBase, escape_js_and_html
from app.config import OUTPUT_DIR
from app.log import logger
from app.person import Me
from app.util import path
from app.util.compress_content import parser_reply, share_card, music_share, file, transfer_decompress, call_decompress
from app.util.emoji import get_emoji_url
from app.util.image import get_image_path, get_image
from app.util.music import get_music_path
icon_files = {
'./icon/word.png': ['doc', 'docx'],
'./icon/excel.png': ['xls', 'xlsx'],
'./icon/csv.png': ['csv'],
'./icon/txt.png': ['txt'],
'./icon/zip.png': ['zip', '7z', 'rar'],
'./icon/ppt.png': ['ppt', 'pptx'],
'./icon/pdf.png': ['pdf'],
}
class HtmlExporter(ExporterBase):
def text(self, doc, message):
type_ = message[2]
str_content = message[7]
str_time = message[8]
is_send = message[4]
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
display_name = self.get_display_name(is_send, message)
avatar = self.get_avatar_path(is_send, message)
str_content = escape_js_and_html(str_content)
doc.write(
f'''{{ type:{1}, text: '{str_content}',is_send:{is_send},avatar_path:'{avatar}',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:'{display_name}'}},'''
)
def image(self, doc, message):
base_path = os.path.join(OUTPUT_DIR, '聊天记录', self.contact.remark, 'image')
type_ = message[2]
str_content = message[7]
str_time = message[8]
is_send = message[4]
BytesExtra = message[10]
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
str_content = escape_js_and_html(str_content)
image_path = hard_link_db.get_image(str_content, BytesExtra, up_dir=Me().wx_dir, thumb=False)
image_path = get_image_path(image_path, base_path=base_path)
doc.write(
f'''{{ type:{type_}, text: '{image_path}',is_send:{is_send},avatar_path:'{avatar}',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:'{display_name}'}},'''
)
def audio(self, doc, message):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
str_content = message[7]
str_time = message[8]
is_send = message[4]
msgSvrId = message[9]
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
try:
audio_path = media_msg_db.get_audio_path(msgSvrId, output_path=origin_path + "/voice")
audio_path = "./voice/" + os.path.basename(audio_path)
except:
logger.error(traceback.format_exc())
return
voice_to_text = media_msg_db.get_audio_text(str_content)
if voice_to_text and voice_to_text != "":
voice_to_text = escape_js_and_html(voice_to_text)
doc.write(
f'''{{ type:34, text:'{audio_path}',is_send:{is_send},avatar_path:'{avatar}',voice_to_text:'{voice_to_text}',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:'{display_name}'}},'''
)
def emoji(self, doc, message):
str_content = message[7]
str_time = message[8]
is_send = message[4]
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
emoji_path = get_emoji_url(str_content, thumb=True)
doc.write(
f'''{{ type:{3}, text: '{emoji_path}',is_send:{is_send},avatar_path:'{avatar}',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:'{display_name}'}},'''
)
def file(self, doc, message):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
bytesExtra = message[10]
compress_content = message[11]
str_time = message[8]
is_send = message[4]
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
file_info = file(bytesExtra, compress_content, output_path=origin_path + '/file')
if file_info.get('is_error') == False:
icon_path = None
for icon, extensions in icon_files.items():
if file_info.get('file_ext') in extensions:
icon_path = icon
break
# 如果没有与文件后缀匹配的图标,则使用默认图标
if icon_path is None:
default_icon = './icon/file.png'
icon_path = default_icon
file_path = file_info.get('file_path')
if file_path != "":
file_path = './file/' + file_info.get('file_name')
doc.write(
f'''{{ type:49, text: '{file_path}',is_send:{is_send},avatar_path:'{avatar}',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:'{display_name}',icon_path: '{icon_path}',sub_type:6,file_name: '{file_info.get('file_name')}',file_size: '{file_info.get('file_len')}',app_name: '{file_info.get('app_name')}'}},'''
)
def refermsg(self, doc, message):
"""
处理回复消息
@param doc:
@param message:
@return:
"""
str_time = message[8]
is_send = message[4]
content = parser_reply(message[11])
refer_msg = content.get('refer')
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
contentText = escape_js_and_html(content.get('title'))
if refer_msg:
referText = f"{escape_js_and_html(refer_msg.get('displayname'))}{escape_js_and_html(refer_msg.get('content'))}"
doc.write(
f'''{{ type:49, text: '{contentText}',is_send:{is_send},sub_type:{content.get('type')},refer_text: '{referText}',avatar_path:'{avatar}',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:'{display_name}'}},'''
)
else:
doc.write(
f'''{{ type:49, text: '{contentText}',is_send:{is_send},sub_type:{content.get('type')},avatar_path:'{avatar}',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:'{display_name}'}},'''
)
def system_msg(self, doc, message):
str_content = message[7]
is_send = message[4]
str_time = message[8]
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
str_content = str_content.replace('<![CDATA[', "").replace(
' <a href="weixin://revoke_edit_click">重新编辑</a>]]>', "")
res = findall('(</{0,1}(img|revo|_wc_cus|a).*?>)', str_content)
for xmlstr, b in res:
str_content = str_content.replace(xmlstr, "")
str_content = escape_js_and_html(str_content)
doc.write(
f'''{{ type:0, text: '{str_content}',is_send:{is_send},avatar_path:'',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:''}},'''
)
def video(self, doc, message):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
type_ = message[2]
str_content = message[7]
str_time = message[8]
is_send = message[4]
BytesExtra = message[10]
timestamp = message[5]
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
video_path = hard_link_db.get_video(str_content, BytesExtra, thumb=False)
image_path = hard_link_db.get_video(str_content, BytesExtra, thumb=True)
if video_path is None and image_path is not None:
image_path = path.get_relative_path(image_path, base_path=f'/data/聊天记录/{self.contact.remark}/image')
try:
# todo 网络图片问题
print(origin_path + image_path[1:])
os.utime(origin_path + image_path[1:], (timestamp, timestamp))
doc.write(
f'''{{ type:3, text: '{image_path}',is_send:{is_send},avatar_path:'{avatar}',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:'{display_name}'}},'''
)
except:
doc.write(
f'''{{ type:1, text: '视频丢失',is_send:{is_send},avatar_path:'{avatar}',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:'{display_name}'}},'''
)
return
if video_path is None and image_path is None:
return
video_path = f'{Me().wx_dir}/{video_path}'
video_path = video_path.replace('\\', '/')
if os.path.exists(video_path):
new_path = origin_path + '/video/' + os.path.basename(video_path)
if not os.path.exists(new_path):
shutil.copy(video_path, os.path.join(origin_path, 'video'))
os.utime(new_path, (timestamp, timestamp))
video_path = f'./video/{os.path.basename(video_path)}'
doc.write(
f'''{{ type:{type_}, text: '{video_path}',is_send:{is_send},avatar_path:'{avatar}',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:'{display_name}'}},'''
)
def music_share(self, doc, message):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
is_send = message[4]
timestamp = message[5]
content = music_share(message[11])
music_path = ''
if content.get('is_error') == False:
if content.get('audio_url') != '':
music_path = get_music_path(content.get('audio_url'), content.get('title'),
output_path=origin_path + '/music')
if music_path != '':
music_path = f'./music/{os.path.basename(music_path)}'
music_path = music_path.replace('\\', '/')
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
music_path = escape_js_and_html(music_path)
doc.write(
f'''{{ type:49, text:'{music_path}',is_send:{is_send},avatar_path:'{avatar}',link_url:'{content.get('link_url')}',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:'{display_name}',sub_type:3,title:'{content.get('title')}',artist:'{content.get('artist')}', website_name:'{content.get('website_name')}'}},'''
)
def share_card(self, doc, message):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
is_send = message[4]
timestamp = message[5]
bytesExtra = message[10]
compress_content_ = message[11]
card_data = share_card(bytesExtra, compress_content_)
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
thumbnail = ''
if card_data.get('thumbnail'):
thumbnail = os.path.join(Me().wx_dir, card_data.get('thumbnail'))
if os.path.exists(thumbnail):
shutil.copy(thumbnail, os.path.join(origin_path, 'image', os.path.basename(thumbnail)))
thumbnail = './image/' + os.path.basename(thumbnail)
else:
thumbnail = ''
app_logo = ''
if card_data.get('app_logo'):
app_logo = os.path.join(Me().wx_dir, card_data.get('app_logo'))
if os.path.exists(app_logo):
shutil.copy(app_logo, os.path.join(origin_path, 'image', os.path.basename(app_logo)))
app_logo = './image/' + os.path.basename(app_logo)
else:
app_logo = card_data.get('app_logo')
doc.write(
f'''{{ type:49,sub_type:5, text:'',is_send:{is_send},avatar_path:'{avatar}',url:'{card_data.get('url')}',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:'{display_name}',title:'{card_data.get('title')}',description:'{card_data.get('description')}',thumbnail:'{thumbnail}',app_logo:'{app_logo}',app_name:'{card_data.get('app_name')}'}},\n'''
)
def transfer(self, doc, message):
is_send = message[4]
timestamp = message[5]
compress_content_ = message[11]
# open("test.bin", "wb").write(compress_content_)
transfer_detail = transfer_decompress(compress_content_)
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
try:
text_info_map = {
1: transfer_detail["pay_memo"] or "发起转账",
3: "已收款",
4: "已退还",
5: "非实时转账收款",
7: "发起非实时转账",
8: "未知",
9: "未知",
}
doc.write(
f"""{{ type:49,sub_type:2000,text:'{text_info_map[transfer_detail["paysubtype"]]}',is_send:{is_send},avatar_path:'{avatar}',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:'{display_name}',paysubtype:{transfer_detail["paysubtype"]},pay_memo:'{transfer_detail["pay_memo"]}',feedesc:'{transfer_detail["feedesc"]}',}},\n""")
except Exception as e:
logger.error(f'转账解析错误:{transfer_detail}\n{traceback.format_exc()}')
def call(self, doc, message):
is_send = message[4]
timestamp = message[5]
str_content = message[7]
bytes_extra = message[10]
display_content = message[12]
call_detail = call_decompress(
is_send, bytes_extra, display_content, str_content
)
is_chatroom = 1 if self.contact.is_chatroom else 0
avatar = self.get_avatar_path(is_send, message)
display_name = self.get_display_name(is_send, message)
doc.write(
f"""{{ type:50, text:'{call_detail["display_content"]}',call_type:{call_detail["call_type"]},avatar_path:'{avatar}',timestamp:{timestamp},is_chatroom:{is_chatroom},displayname:'{display_name}',}},\n""")
def export(self):
print(f"【开始导出 HTML {self.contact.remark}")
messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range)
filename = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark,
f'{self.contact.remark}.html')
file_path = './app/resources/data/template.html'
if not os.path.exists(file_path):
resource_dir = getattr(sys, '_MEIPASS', os.path.abspath(os.path.dirname(__file__)))
file_path = os.path.join(resource_dir, 'app', 'resources', 'data', 'template.html')
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
html_head, html_end = content.split('/*注意看这是分割线*/')
f = open(filename, 'w', encoding='utf-8')
html_head = html_head.replace("<title>出错了</title>", f"<title>{self.contact.remark}</title>")
html_head = html_head.replace("<p id=\"title\">出错了</p>", f"<p id=\"title\">{self.contact.remark}</p>")
f.write(html_head)
self.rangeSignal.emit(len(messages))
for index, message in enumerate(messages):
type_ = message[2]
sub_type = message[3]
timestamp = message[5]
if (type_ == 3 and self.message_types.get(3)) or (type_ == 34 and self.message_types.get(34)) or (
type_ == 47 and self.message_types.get(47)):
pass
else:
self.progressSignal.emit(1)
if type_ == 1 and self.message_types.get(type_):
self.text(f, message)
elif type_ == 3 and self.message_types.get(type_):
self.image(f, message)
elif type_ == 34 and self.message_types.get(type_):
self.audio(f, message)
elif type_ == 43 and self.message_types.get(type_):
self.video(f, message)
elif type_ == 47 and self.message_types.get(type_):
self.emoji(f, message)
elif type_ == 10000 and self.message_types.get(type_):
self.system_msg(f, message)
elif type_ == 49 and sub_type == 57 and self.message_types.get(1):
self.refermsg(f, message)
elif type_ == 49 and sub_type == 6 and self.message_types.get(4906):
self.file(f, message)
elif type_ == 49 and sub_type == 3 and self.message_types.get(4903):
self.music_share(f, message)
elif type_ == 49 and sub_type == 5 and self.message_types.get(4905):
self.share_card(f, message)
elif type_ == 49 and sub_type == 2000 and self.message_types.get(492000):
self.transfer(f, message)
elif type_ == 50 and self.message_types.get(50):
self.call(f, message)
if index % 2000 == 0:
print(f"【导出 HTML {self.contact.remark}{index}/{len(messages)}")
f.write(html_end)
f.close()
print(f"【完成导出 HTML {self.contact.remark}{len(messages)}")
self.count_finish_num(1)
def count_finish_num(self, num):
"""
记录子线程完成个数
@param num:
@return:
"""
self.num += 1
print("子线程完成", self.num, "/", self.total_num)
if self.num == self.total_num:
# 所有子线程都完成之后就发送完成信号
self.okSignal.emit(1)
class OutputMedia(QThread):
"""
导出语音消息
"""
okSingal = pyqtSignal(int)
progressSignal = pyqtSignal(int)
def __init__(self, contact):
super().__init__()
self.contact = contact
def run(self):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
messages = msg_db.get_messages_by_type(self.contact.wxid, 34)
for message in messages:
is_send = message[4]
msgSvrId = message[9]
try:
audio_path = media_msg_db.get_audio(
msgSvrId, output_path=origin_path + "/voice"
)
except:
logger.error(traceback.format_exc())
finally:
self.progressSignal.emit(1)
self.okSingal.emit(34)
class OutputEmoji(QThread):
"""
导出表情包
"""
okSingal = pyqtSignal(int)
progressSignal = pyqtSignal(int)
def __init__(self, contact):
super().__init__()
self.contact = contact
def run(self):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
messages = msg_db.get_messages_by_type(self.contact.wxid, 47)
for message in messages:
str_content = message[7]
try:
pass
# emoji_path = get_emoji(str_content, thumb=True, output_path=origin_path + '/emoji')
except:
logger.error(traceback.format_exc())
finally:
self.progressSignal.emit(1)
self.okSingal.emit(47)
class OutputImage(QThread):
"""
导出图片
"""
okSingal = pyqtSignal(int)
progressSignal = pyqtSignal(int)
def __init__(self, contact):
super().__init__()
self.contact = contact
self.child_thread_num = 2
self.child_threads = [0] * (self.child_thread_num + 1)
self.num = 0
def count1(self, num):
self.num += 1
print("图片导出完成一个")
if self.num == self.child_thread_num:
self.okSingal.emit(47)
print("图片导出完成")
def run(self):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
messages = msg_db.get_messages_by_type(self.contact.wxid, 3)
base_path = os.path.join(OUTPUT_DIR, '聊天记录', self.contact.remark, 'image')
for message in messages:
str_content = message[7]
BytesExtra = message[10]
timestamp = message[5]
try:
image_path = hard_link_db.get_image(
str_content, BytesExtra, thumb=False
)
if not os.path.exists(os.path.join(Me().wx_dir, image_path)):
image_thumb_path = hard_link_db.get_image(
str_content, BytesExtra, thumb=True
)
if not os.path.exists(os.path.join(Me().wx_dir, image_thumb_path)):
continue
image_path = image_thumb_path
image_path = get_image(
image_path, base_path=base_path
)
try:
os.utime(origin_path + image_path[1:], (timestamp, timestamp))
except:
pass
except:
logger.error(traceback.format_exc())
finally:
self.progressSignal.emit(1)
self.okSingal.emit(47)
class OutputImageChild(QThread):
okSingal = pyqtSignal(int)
progressSignal = pyqtSignal(int)
def __init__(self, contact, messages):
super().__init__()
self.contact = contact
self.messages = messages
def run(self):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
for message in self.messages:
str_content = message[7]
BytesExtra = message[10]
timestamp = message[5]
try:
image_path = hard_link_db.get_image(
str_content, BytesExtra, thumb=False
)
if not os.path.exists(os.path.join(Me().wx_dir, image_path)):
image_thumb_path = hard_link_db.get_image(
str_content, BytesExtra, thumb=True
)
if not os.path.exists(os.path.join(Me().wx_dir, image_thumb_path)):
continue
image_path = image_thumb_path
image_path = get_image(
image_path, base_path=f"/data/聊天记录/{self.contact.remark}/image"
)
try:
os.utime(origin_path + image_path[1:], (timestamp, timestamp))
except:
pass
except:
logger.error(traceback.format_exc())
finally:
self.progressSignal.emit(1)
self.okSingal.emit(47)
print("图片子线程完成")

View File

@@ -0,0 +1,193 @@
import json
import random
import os
from app.DataBase import msg_db
from app.person import Me
from .exporter import ExporterBase
def merge_content(conversions_list) -> list:
"""
合并一组对话中连续发送的句子
@param conversions_list:
@return:
"""
merged_data = []
current_role = None
current_content = ""
str_time = ''
for item in conversions_list:
if 'str_time' in item:
str_time = item['str_time']
else:
str_time = ''
if current_role is None:
current_role = item["role"]
current_content = item["content"]
elif current_role == item["role"]:
current_content += "\n" + item["content"]
else:
# merged_data.append({"role": current_role, "content": current_content, 'str_time': str_time})
merged_data.append({"role": current_role, "content": current_content})
current_role = item["role"]
current_content = item["content"]
str_time = item.get('str_time')
# 处理最后一组
if current_role is not None:
# merged_data.append({"role": current_role, "content": current_content,'str_time': str_time})
merged_data.append({"role": current_role, "content": current_content})
return merged_data
def system_prompt():
system = {
"role": "system",
# "content": f"你是{Me().name},一个聪明、热情、善良的男大学生,后面的对话来自{self.contact.remark}(!!!注意:对方的身份十分重要,你务必记住对方的身份,因为跟不同的人对话要用不同的态度、语气),你要认真地回答他"
"content": f"你是{Me().name},一个聪明、热情、善良的人,后面的对话来自你的朋友,你要认真地回答他"
}
return system
def message_to_conversion(group):
conversions = [system_prompt()]
while len(group) and group[-1][4] == 0:
group.pop()
for message in group:
is_send = message[4]
if len(conversions) == 1 and is_send:
continue
if is_send:
json_msg = {
"role": "assistant",
"content": message[7]
}
else:
json_msg = {
"role": "user",
"content": message[7]
}
json_msg['str_time'] = message[8]
conversions.append(json_msg)
if len(conversions) == 1:
return []
return merge_content(conversions)
class JsonExporter(ExporterBase):
def split_by_time(self, length=300):
messages = msg_db.get_messages_by_type(self.contact.wxid, type_=1, time_range=self.time_range)
start_time = 0
res = []
i = 0
while i < len(messages):
message = messages[i]
timestamp = message[5]
is_send = message[4]
group = [
system_prompt()
]
while i < len(messages) and timestamp - start_time < length:
if is_send:
json_msg = {
"role": "assistant",
"content": message[7]
}
else:
json_msg = {
"role": "user",
"content": message[7]
}
group.append(json_msg)
i += 1
if i >= len(messages):
break
message = messages[i]
timestamp = message[5]
is_send = message[4]
while is_send:
json_msg = {
"role": "assistant",
"content": message[7]
}
group.append(json_msg)
i += 1
if i >= len(messages):
break
message = messages[i]
timestamp = message[5]
is_send = message[4]
start_time = timestamp
res.append(
{
"conversations": group
}
)
res_ = []
for item in res:
conversations = item['conversations']
res_.append({
'conversations': merge_content(conversations)
})
return res_
def split_by_intervals(self, max_diff_seconds=300):
messages = msg_db.get_messages_by_type(self.contact.wxid, type_=1, time_range=self.time_range)
res = []
i = 0
current_group = []
while i < len(messages):
message = messages[i]
timestamp = message[5]
is_send = message[4]
while is_send and i + 1 < len(messages):
i += 1
message = messages[i]
is_send = message[4]
current_group = [messages[i]]
i += 1
while i < len(messages) and messages[i][5] - current_group[-1][5] <= max_diff_seconds:
current_group.append(messages[i])
i += 1
while i < len(messages) and messages[i][4]:
current_group.append(messages[i])
i += 1
res.append(current_group)
res_ = []
for group in res:
conversations = message_to_conversion(group)
if conversations:
res_.append({
'conversations': conversations
})
return res_
def to_json(self):
print(f"【开始导出 json {self.contact.remark}")
origin_path = self.origin_path
os.makedirs(origin_path, exist_ok=True)
filename = os.path.join(origin_path, f"{self.contact.remark}")
# res = self.split_by_time()
res = self.split_by_intervals(60)
# 打乱列表顺序
random.shuffle(res)
# 计算切分比例
split_ratio = 0.2 # 20% for the second list
# 计算切分点
split_point = int(len(res) * split_ratio)
# 分割列表
train_data = res[split_point:]
dev_data = res[:split_point]
with open(f'{filename}_train.json', "w", encoding="utf-8") as f:
json.dump(train_data, f, ensure_ascii=False, indent=4)
with open(f'{filename}_dev.json', "w", encoding="utf-8") as f:
json.dump(dev_data, f, ensure_ascii=False, indent=4)
self.okSignal.emit(1)
def run(self):
self.to_json()

View File

@@ -0,0 +1,146 @@
import os
from app.DataBase import msg_db
from app.util.exporter.exporter import ExporterBase
from app.config import OUTPUT_DIR
from app.util.compress_content import parser_reply, share_card
class TxtExporter(ExporterBase):
def text(self, doc, message):
str_content = message[7]
str_time = message[8]
is_send = message[4]
display_name = self.get_display_name(is_send, message)
name = display_name
doc.write(
f'''{str_time} {name}\n{str_content}\n\n'''
)
def image(self, doc, message):
str_time = message[8]
is_send = message[4]
display_name = self.get_display_name(is_send, message)
doc.write(
f'''{str_time} {display_name}\n[图片]\n\n'''
)
def audio(self, doc, message):
str_time = message[8]
is_send = message[4]
display_name = self.get_display_name(is_send, message)
doc.write(
f'''{str_time} {display_name}\n[语音]\n\n'''
)
def emoji(self, doc, message):
str_time = message[8]
is_send = message[4]
display_name = self.get_display_name(is_send, message)
doc.write(
f'''{str_time} {display_name}\n[表情包]\n\n'''
)
def file(self, doc, message):
str_time = message[8]
is_send = message[4]
display_name = self.get_display_name(is_send, message)
doc.write(
f'''{str_time} {display_name}\n[文件]\n\n'''
)
def refermsg(self, doc, message):
"""
处理回复消息
@param doc:
@param message:
@return:
"""
str_time = message[8]
is_send = message[4]
content = parser_reply(message[11])
refer_msg = content.get('refer')
display_name = self.get_display_name(is_send, message)
if refer_msg:
doc.write(
f'''{str_time} {display_name}\n{content.get('title')}\n引用:{refer_msg.get('displayname')}:{refer_msg.get('content')}\n\n'''
)
else:
doc.write(
f'''{str_time} {display_name}\n{content.get('title')}\n引用:未知\n\n'''
)
def system_msg(self, doc, message):
str_content = message[7]
str_time = message[8]
str_content = str_content.replace('<![CDATA[', "").replace(
' <a href="weixin://revoke_edit_click">重新编辑</a>]]>', "")
doc.write(
f'''{str_time} {str_content}\n\n'''
)
def video(self, doc, message):
str_time = message[8]
is_send = message[4]
display_name = self.get_display_name(is_send, message)
doc.write(
f'''{str_time} {display_name}\n[视频]\n\n'''
)
def music_share(self, doc, message):
is_send = message[4]
str_time = message[8]
display_name = self.get_display_name(is_send, message)
doc.write(
f'''{str_time} {display_name}\n[音乐分享]\n\n'''
)
def share_card(self, doc, message):
is_send = message[4]
bytesExtra = message[10]
compress_content_ = message[11]
str_time = message[8]
card_data = share_card(bytesExtra, compress_content_)
display_name = self.get_display_name(is_send, message)
doc.write(
f'''{str_time} {display_name}
[链接]:title:{card_data.get('title')}
description:{card_data.get('description')}
url:{card_data.get('url')}
name:{card_data.get('app_name')}
\n\n'''
)
def export(self):
# 实现导出为txt的逻辑
print(f"【开始导出 TXT {self.contact.remark}")
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
os.makedirs(origin_path, exist_ok=True)
filename = os.path.join(origin_path, self.contact.remark+'.txt')
messages = msg_db.get_messages(self.contact.wxid, time_range=self.time_range)
total_steps = len(messages)
with open(filename, mode='w', newline='', encoding='utf-8') as f:
for index, message in enumerate(messages):
type_ = message[2]
sub_type = message[3]
self.progressSignal.emit(int((index + 1) / total_steps * 100))
if type_ == 1 and self.message_types.get(type_):
self.text(f, message)
elif type_ == 3 and self.message_types.get(type_):
self.image(f, message)
elif type_ == 34 and self.message_types.get(type_):
self.audio(f, message)
elif type_ == 43 and self.message_types.get(type_):
self.video(f, message)
elif type_ == 47 and self.message_types.get(type_):
self.emoji(f, message)
elif type_ == 10000 and self.message_types.get(type_):
self.system_msg(f, message)
elif type_ == 49 and sub_type == 57 and self.message_types.get(1):
self.refermsg(f, message)
elif type_ == 49 and sub_type == 6 and self.message_types.get(4906):
self.file(f, message)
elif type_ == 49 and sub_type == 3 and self.message_types.get(4903):
self.music_share(f, message)
elif type_ == 49 and sub_type == 5 and self.message_types.get(4905):
self.share_card(f, message)
print(f"【完成导出 TXT {self.contact.remark}")
self.okSignal.emit(1)

466
app/util/exporter/output.py Normal file
View File

@@ -0,0 +1,466 @@
import csv
import os
import time
import traceback
from typing import List
import docx
from PyQt5.QtCore import pyqtSignal, QThread
from PyQt5.QtWidgets import QFileDialog
from docx.oxml.ns import qn
from docxcompose.composer import Composer
from app.util.exporter.exporter_ai_txt import AiTxtExporter
from app.util.exporter.exporter_csv import CSVExporter
from app.util.exporter.exporter_docx import DocxExporter
from app.util.exporter.exporter_html import HtmlExporter
from app.util.exporter.exporter_json import JsonExporter
from app.util.exporter.exporter_txt import TxtExporter
from app.DataBase.hard_link import decodeExtraBuf
from app.config import OUTPUT_DIR
from app.DataBase.package_msg import PackageMsg
from app.DataBase import media_msg_db, hard_link_db, micro_msg_db, msg_db
from app.log import logger
from app.person import Me
from app.util.image import get_image
os.makedirs(os.path.join(OUTPUT_DIR, '聊天记录'), exist_ok=True)
class Output(QThread):
"""
发送信息线程
"""
startSignal = pyqtSignal(int)
progressSignal = pyqtSignal(int)
rangeSignal = pyqtSignal(int)
okSignal = pyqtSignal(int)
batchOkSignal = pyqtSignal(int)
nowContact = pyqtSignal(str)
i = 1
CSV = 0
DOCX = 1
HTML = 2
CSV_ALL = 3
CONTACT_CSV = 4
TXT = 5
JSON = 6
AI_TXT = 7
Batch = 10086
def __init__(self, contact, type_=DOCX, message_types={}, sub_type=[], time_range=None, parent=None):
super().__init__(parent)
self.children = []
self.last_timestamp = 0
self.sub_type = sub_type
self.time_range = time_range
self.message_types = message_types
self.sec = 2 # 默认1000秒
self.contact = contact
self.msg_id = 0
self.output_type: int | List[int] = type_
self.total_num = 1
self.num = 0
def progress(self, value):
self.progressSignal.emit(value)
def output_image(self):
"""
导出全部图片
@return:
"""
return
def output_emoji(self):
"""
导出全部表情包
@return:
"""
return
def to_csv_all(self):
"""
导出全部聊天记录到CSV
@return:
"""
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录')
os.makedirs(origin_path, exist_ok=True)
filename = QFileDialog.getSaveFileName(None, "save file", os.path.join(os.getcwd(), 'messages.csv'),
"csv files (*.csv);;all files(*.*)")
if not filename[0]:
return
self.startSignal.emit(1)
filename = filename[0]
# columns = ["用户名", "消息内容", "发送时间", "发送状态", "消息类型", "isSend", "msgId"]
columns = ['localId', 'TalkerId', 'Type', 'SubType',
'IsSender', 'CreateTime', 'Status', 'StrContent',
'StrTime', 'Remark', 'NickName', 'Sender']
packagemsg = PackageMsg()
messages = packagemsg.get_package_message_all()
# 写入CSV文件
with open(filename, mode='w', newline='', encoding='utf-8-sig') as file:
writer = csv.writer(file)
writer.writerow(columns)
# 写入数据
writer.writerows(messages)
self.okSignal.emit(1)
def contact_to_csv(self):
"""
导出联系人到CSV
@return:
"""
filename = QFileDialog.getSaveFileName(None, "save file", os.path.join(os.getcwd(), 'contacts.csv'),
"csv files (*.csv);;all files(*.*)")
if not filename[0]:
return
self.startSignal.emit(1)
filename = filename[0]
# columns = ["用户名", "消息内容", "发送时间", "发送状态", "消息类型", "isSend", "msgId"]
columns = ['UserName', 'Alias', 'Type', 'Remark', 'NickName', 'PYInitial', 'RemarkPYInitial', 'smallHeadImgUrl',
'bigHeadImgUrl', 'label', 'gender', 'telephone', 'signature', 'country/region', 'province', 'city']
contacts = micro_msg_db.get_contact()
# 写入CSV文件
with open(filename, mode='w', newline='', encoding='utf-8-sig') as file:
writer = csv.writer(file)
writer.writerow(columns)
# 写入数据
# writer.writerows(contacts)
for contact in contacts:
detail = decodeExtraBuf(contact[9])
gender_code = detail.get('gender')
if gender_code == 0:
gender = '未知'
elif gender_code == 1:
gender = ''
else:
gender = ''
writer.writerow([*contact[:9], contact[10], gender, detail.get('telephone'), detail.get('signature'),
*detail.get('region')])
self.okSignal.emit(1)
def batch_export(self):
print('开始批量导出')
print(self.sub_type, self.message_types)
print(len(self.contact))
print([contact.remark for contact in self.contact])
self.batch_num_total = len(self.contact) * len(self.sub_type)
self.batch_num = 0
self.rangeSignal.emit(self.batch_num_total)
for contact in self.contact:
# print('联系人', contact.remark)
for type_ in self.sub_type:
# print('导出类型', type_)
if type_ == self.DOCX:
self.to_docx(contact, self.message_types, True)
elif type_ == self.TXT:
# print('批量导出txt')
self.to_txt(contact, self.message_types, True)
elif type_ == self.AI_TXT:
# print('批量导出txt')
self.to_ai_txt(contact, self.message_types, True)
elif type_ == self.CSV:
self.to_csv(contact, self.message_types, True)
elif type_ == self.HTML:
self.to_html(contact, self.message_types, True)
elif type_ == self.JSON:
self.to_json(contact,self.message_types,True)
def batch_finish_one(self, num):
self.nowContact.emit(self.contact[self.batch_num // len(self.sub_type)].remark)
self.batch_num += 1
if self.batch_num == self.batch_num_total:
self.okSignal.emit(1)
def merge_docx(self, n):
conRemark = self.contact.remark
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', conRemark)
filename = f"{origin_path}/{conRemark}_{n}.docx"
if n == 10086:
# self.document.append(self.document)
file = os.path.join(origin_path, f'{conRemark}.docx')
try:
self.document.save(file)
except PermissionError:
file = file[:-5] + f'{time.time()}' + '.docx'
self.document.save(file)
self.okSignal.emit(1)
return
doc = docx.Document(filename)
self.document.append(doc)
os.remove(filename)
if n % 50 == 0:
# self.document.append(self.document)
file = os.path.join(origin_path, f'{conRemark}-{n // 50}.docx')
try:
self.document.save(file)
except PermissionError:
file = file[:-5] + f'{time.time()}' + '.docx'
self.document.save(file)
doc = docx.Document()
doc.styles["Normal"].font.name = "Cambria"
doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
self.document = Composer(doc)
def to_docx(self, contact, message_types, is_batch=False):
doc = docx.Document()
doc.styles["Normal"].font.name = "Cambria"
doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
self.document = Composer(doc)
Child = DocxExporter(contact, type_=self.DOCX, message_types=message_types, time_range=self.time_range)
self.children.append(Child)
Child.progressSignal.connect(self.progress)
if not is_batch:
Child.rangeSignal.connect(self.rangeSignal)
Child.okSignal.connect(self.merge_docx if not is_batch else self.batch_finish_one)
Child.start()
def to_json(self, contact, message_types, is_batch=False):
Child = JsonExporter(contact, type_=self.JSON, message_types=message_types, time_range=self.time_range)
self.children.append(Child)
Child.progressSignal.connect(self.progress)
if not is_batch:
Child.rangeSignal.connect(self.rangeSignal)
Child.okSignal.connect(self.okSignal if not is_batch else self.batch_finish_one)
Child.start()
def to_txt(self, contact, message_types, is_batch=False):
Child = TxtExporter(contact, type_=self.TXT, message_types=message_types, time_range=self.time_range)
self.children.append(Child)
Child.progressSignal.connect(self.progress)
if not is_batch:
Child.rangeSignal.connect(self.rangeSignal)
Child.okSignal.connect(self.okSignal if not is_batch else self.batch_finish_one)
Child.start()
def to_ai_txt(self, contact, message_types, is_batch=False):
Child = AiTxtExporter(contact, type_=self.TXT, message_types=message_types, time_range=self.time_range)
self.children.append(Child)
Child.progressSignal.connect(self.progress)
if not is_batch:
Child.rangeSignal.connect(self.rangeSignal)
Child.okSignal.connect(self.okSignal if not is_batch else self.batch_finish_one)
Child.start()
def to_html(self, contact, message_types, is_batch=False):
Child = HtmlExporter(contact, type_=self.output_type, message_types=message_types, time_range=self.time_range)
self.children.append(Child)
Child.progressSignal.connect(self.progress)
if not is_batch:
Child.rangeSignal.connect(self.rangeSignal)
Child.okSignal.connect(self.count_finish_num)
Child.start()
self.total_num = 1
if message_types.get(34):
# 语音消息单独的线程
self.total_num += 1
output_media = OutputMedia(contact, time_range=self.time_range)
self.children.append(output_media)
output_media.okSingal.connect(self.count_finish_num)
output_media.progressSignal.connect(self.progressSignal)
output_media.start()
if message_types.get(47):
# emoji消息单独的线程
self.total_num += 1
output_emoji = OutputEmoji(contact, time_range=self.time_range)
self.children.append(output_emoji)
output_emoji.okSingal.connect(self.count_finish_num)
output_emoji.progressSignal.connect(self.progressSignal)
output_emoji.start()
if message_types.get(3):
# 图片消息单独的线程
self.total_num += 1
output_image = OutputImage(contact, time_range=self.time_range)
self.children.append(output_image)
output_image.okSingal.connect(self.count_finish_num)
output_image.progressSignal.connect(self.progressSignal)
output_image.start()
def to_csv(self, contact, message_types, is_batch=False):
Child = CSVExporter(contact, type_=self.CSV, message_types=message_types, time_range=self.time_range)
self.children.append(Child)
Child.progressSignal.connect(self.progress)
if not is_batch:
Child.rangeSignal.connect(self.rangeSignal)
Child.okSignal.connect(self.okSignal if not is_batch else self.batch_finish_one)
Child.start()
def run(self):
if self.output_type == self.DOCX:
self.to_docx(self.contact, self.message_types)
elif self.output_type == self.CSV_ALL:
self.to_csv_all()
elif self.output_type == self.CONTACT_CSV:
self.contact_to_csv()
elif self.output_type == self.TXT:
self.to_txt(self.contact, self.message_types)
elif self.output_type == self.AI_TXT:
self.to_ai_txt(self.contact, self.message_types)
elif self.output_type == self.CSV:
self.to_csv(self.contact, self.message_types)
elif self.output_type == self.HTML:
self.to_html(self.contact, self.message_types)
elif self.output_type == self.JSON:
self.to_json(self.contact, self.message_types)
elif self.output_type == self.Batch:
self.batch_export()
def count_finish_num(self, num):
"""
记录子线程完成个数
@param num:
@return:
"""
self.num += 1
if self.num == self.total_num:
# 所有子线程都完成之后就发送完成信号
if self.output_type == self.Batch:
self.batch_finish_one(1)
else:
self.okSignal.emit(1)
self.num = 0
def cancel(self):
self.requestInterruption()
class OutputMedia(QThread):
"""
导出语音消息
"""
okSingal = pyqtSignal(int)
progressSignal = pyqtSignal(int)
def __init__(self, contact, time_range=None):
super().__init__()
self.contact = contact
self.time_range = time_range
def run(self):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
messages = msg_db.get_messages_by_type(self.contact.wxid, 34, time_range=self.time_range)
for message in messages:
is_send = message[4]
msgSvrId = message[9]
try:
audio_path = media_msg_db.get_audio(msgSvrId, output_path=origin_path + "/voice")
except:
logger.error(traceback.format_exc())
finally:
self.progressSignal.emit(1)
self.okSingal.emit(34)
class OutputEmoji(QThread):
"""
导出表情包
"""
okSingal = pyqtSignal(int)
progressSignal = pyqtSignal(int)
def __init__(self, contact, time_range=None):
super().__init__()
self.contact = contact
self.time_range = time_range
def run(self):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
messages = msg_db.get_messages_by_type(self.contact.wxid, 47, time_range=self.time_range)
for message in messages:
str_content = message[7]
try:
pass
# emoji_path = get_emoji(str_content, thumb=True, output_path=origin_path + '/emoji')
except:
logger.error(traceback.format_exc())
finally:
self.progressSignal.emit(1)
self.okSingal.emit(47)
class OutputImage(QThread):
"""
导出图片
"""
okSingal = pyqtSignal(int)
progressSignal = pyqtSignal(int)
def __init__(self, contact, time_range):
super().__init__()
self.contact = contact
self.child_thread_num = 2
self.time_range = time_range
self.child_threads = [0] * (self.child_thread_num + 1)
self.num = 0
def count1(self, num):
self.num += 1
print('图片导出完成一个')
if self.num == self.child_thread_num:
self.okSingal.emit(47)
print('图片导出完成')
def run(self):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
messages = msg_db.get_messages_by_type(self.contact.wxid, 3, time_range=self.time_range)
base_path = os.path.join(OUTPUT_DIR, '聊天记录', self.contact.remark, 'image')
for message in messages:
str_content = message[7]
BytesExtra = message[10]
timestamp = message[5]
try:
image_path = hard_link_db.get_image(str_content, BytesExtra, up_dir=Me().wx_dir, thumb=False)
image_path = get_image(image_path, base_path=base_path)
try:
os.utime(origin_path + image_path[1:], (timestamp, timestamp))
except:
pass
except:
logger.error(traceback.format_exc())
finally:
self.progressSignal.emit(1)
self.okSingal.emit(47)
class OutputImageChild(QThread):
okSingal = pyqtSignal(int)
progressSignal = pyqtSignal(int)
def __init__(self, contact, messages, time_range):
super().__init__()
self.contact = contact
self.messages = messages
self.time_range = time_range
def run(self):
origin_path = os.path.join(os.getcwd(), OUTPUT_DIR, '聊天记录', self.contact.remark)
for message in self.messages:
str_content = message[7]
BytesExtra = message[10]
timestamp = message[5]
try:
image_path = hard_link_db.get_image(str_content, BytesExtra, thumb=False)
if not os.path.exists(os.path.join(Me().wx_dir, image_path)):
image_thumb_path = hard_link_db.get_image(str_content, BytesExtra, thumb=True)
if not os.path.exists(os.path.join(Me().wx_dir, image_thumb_path)):
continue
image_path = image_thumb_path
image_path = get_image(image_path, base_path=f'/data/聊天记录/{self.contact.remark}/image')
try:
os.utime(origin_path + image_path[1:], (timestamp, timestamp))
except:
pass
except:
logger.error(traceback.format_exc())
finally:
self.progressSignal.emit(1)
self.okSingal.emit(47)
print('图片子线程完成')
if __name__ == "__main__":
pass