Commit 282af7b7 authored by Yaowentong's avatar Yaowentong

1

parents
import json
import os
from functools import wraps
from loguru import logger
from flask import jsonify
# Directory that receives the log files; created at import time if it is missing.
LOG_DIR = "logs"
os.makedirs(LOG_DIR, exist_ok=True)
# Remove the handlers registered so far so only the two file sinks below receive records.
logger.remove()
# INFO-and-above sink: one file per day (rotation at 00:00), files kept for 7 days.
logger.add(
    os.path.join(LOG_DIR, "info_{time:YYYY-MM-DD}.log"),
    level="INFO",
    rotation="00:00",
    retention="7 days",
    encoding="utf-8",
    enqueue=True,
    backtrace=False
)
# ERROR-level sink: rotated daily, files kept for 7 days.
# NOTE(review): diagnose=True makes tracebacks include variable values — confirm that is
# acceptable for the data this service handles.
logger.add(
    os.path.join(LOG_DIR, "err_{time:YYYY-MM-DD}.log"),
    level="ERROR",
    rotation="00:00",
    retention="7 days",
    encoding="utf-8",
    enqueue=True,
    backtrace=True,
    diagnose=True
)
class ApiResponse:
    """Builds the JSON envelope (``code`` / ``data`` / ``msg``) returned by the API.

    The ``code`` field travels in the JSON body only; no HTTP status is passed to
    ``jsonify`` by either method.
    """

    @staticmethod
    def success(data=None, msg="success"):
        """Log the payload at INFO level and return a code-200 envelope around ``data``."""
        envelope = dict(code=200, data=data, msg=msg)
        logger.info(f"dso-ai-bot: {msg}, 数据: {data}")
        return jsonify(envelope)

    @staticmethod
    def error(msg="服务器内部错误", code=500):
        """Log the failure at ERROR level and return an envelope whose ``data`` is None."""
        envelope = dict(code=code, data=None, msg=msg)
        logger.error(f"dso-ai-bot: {code}, exception: {msg}")
        return jsonify(envelope)
def handle_exceptions(func):
    """Decorator: wrap ``func``'s return value in ``ApiResponse.success``.

    Any exception raised by ``func`` (or while building the success response) is logged
    with its traceback and converted into ``ApiResponse.error(str(exc), 500)``.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            # Building the success response stays inside the try so that a payload
            # that cannot be serialised is reported as an error envelope as well.
            return ApiResponse.success(func(*args, **kwargs))
        except Exception as exc:
            logger.exception(f"function {func.__name__} exception")
            return ApiResponse.error(str(exc), 500)

    return wrapper
import requests
import json
# Value of the Authorization header sent by the request helpers below.
# NOTE(review): this credential is hard-coded in source control — consider loading it
# from the environment or a secrets store, and rotating the key.
api_key = "Bearer fcc424e5-58af-494d-9683-5787413a26c9"
def get_ai_chat_history_lite(messages, stream=False):
    """POST ``messages`` to the chat-completions endpoint using model ``ep-m-20250408105049-7dj9r``.

    Args:
        messages: Message list forwarded unchanged as the ``messages`` field
            (presumably role/content dicts — confirm against callers).
        stream: Forwarded as the ``stream`` field. The response is always parsed as
            one JSON document, so only ``False`` is meaningful here.

    Returns:
        ``(ai_reply, total_tokens)``: the first choice's ``message`` (``{}`` when the
        response carries no choices) and ``usage.total_tokens`` (``None`` when absent).

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        ValueError: when the response body is not valid JSON.
    """
    url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
    payload = json.dumps({
        "model": "ep-m-20250408105049-7dj9r",
        "stream": stream,
        "messages": messages
    })
    headers = {
        'Authorization': api_key,
        'Content-Type': 'application/json'
    }
    # Bound the wait so a stalled upstream cannot block the caller indefinitely.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=300)
    data = response.json()
    # An error response may have no "choices" (or an empty/null one); fall back to an
    # empty reply instead of failing with IndexError/TypeError.
    choices = data.get('choices') or [{}]
    ai_reply = choices[0].get('message', {})
    total_tokens = (data.get('usage') or {}).get('total_tokens')
    return ai_reply, total_tokens
# Chat completion (non-streaming)
def get_ai_chat_history(messages, stream=False):
    """POST ``messages`` to the chat-completions endpoint using model ``ep-20241214232538-x27zp``.

    Args:
        messages: Message list forwarded unchanged as the ``messages`` field
            (presumably role/content dicts — confirm against callers).
        stream: Forwarded as the ``stream`` field. The response is always parsed as
            one JSON document, so only ``False`` is meaningful here.

    Returns:
        ``(ai_reply, total_tokens)``: the first choice's ``message`` (``{}`` when the
        response carries no choices) and ``usage.total_tokens`` (``None`` when absent).

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        ValueError: when the response body is not valid JSON.
    """
    url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
    payload = json.dumps({
        "model": "ep-20241214232538-x27zp",
        "stream": stream,
        "messages": messages
    })
    headers = {
        'Authorization': api_key,
        'Content-Type': 'application/json'
    }
    # Bound the wait so a stalled upstream cannot block the caller indefinitely.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=300)
    data = response.json()
    # An error response may have no "choices" (or an empty/null one); fall back to an
    # empty reply instead of failing with IndexError/TypeError.
    choices = data.get('choices') or [{}]
    ai_reply = choices[0].get('message', {})
    total_tokens = (data.get('usage') or {}).get('total_tokens')
    return ai_reply, total_tokens
# Bot with the web-search plugin
def get_bot_with_history(messages):
    """POST ``messages`` to the bots chat-completions endpoint using bot ``bot-20250730164611-nbsng``.

    Args:
        messages: Message list forwarded unchanged as the ``messages`` field.

    Returns:
        ``(ai_reply, total_tokens)``: the first choice's ``message`` (``{}`` when the
        response carries no choices) and the ``total_tokens`` of the first
        ``bot_usage.model_usage`` entry (``None`` when that information is absent).

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        ValueError: when the response body is not valid JSON.
    """
    url = "https://ark.cn-beijing.volces.com/api/v3/bots/chat/completions"
    payload = json.dumps({
        "model": "bot-20250730164611-nbsng",
        "stream": False,
        "stream_options": {
            "include_usage": False
        },
        "messages": messages
    })
    headers = {
        'Authorization': api_key,
        'Content-Type': 'application/json'
    }
    # Bound the wait so a stalled upstream cannot block the caller indefinitely.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=300)
    data = response.json()
    # Error responses may omit "choices" and "bot_usage"; indexing the missing values
    # would raise IndexError/TypeError, so substitute empty placeholders.
    choices = data.get('choices') or [{}]
    ai_reply = choices[0].get('message', {})
    model_usage = (data.get('bot_usage') or {}).get('model_usage') or [{}]
    total_tokens = model_usage[0].get('total_tokens')
    return ai_reply, total_tokens
def get_bot_with_history_rag(messages):
    """POST ``messages`` to the bots chat-completions endpoint using bot ``bot-20250725180737-mjxbx``.

    Args:
        messages: Message list forwarded unchanged as the ``messages`` field.

    Returns:
        ``(ai_reply, total_tokens)``: the first choice's ``message`` (``{}`` when the
        response carries no choices) and the ``total_tokens`` of the first
        ``bot_usage.model_usage`` entry (``None`` when that information is absent).

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        ValueError: when the response body is not valid JSON.
    """
    url = "https://ark.cn-beijing.volces.com/api/v3/bots/chat/completions"
    payload = json.dumps({
        "model": "bot-20250725180737-mjxbx",
        "stream": False,
        "stream_options": {
            "include_usage": False
        },
        "messages": messages
    })
    headers = {
        'Authorization': api_key,
        'Content-Type': 'application/json'
    }
    # Bound the wait so a stalled upstream cannot block the caller indefinitely.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=300)
    data = response.json()
    # Error responses may omit "choices" and "bot_usage"; indexing the missing values
    # would raise IndexError/TypeError, so substitute empty placeholders.
    choices = data.get('choices') or [{}]
    ai_reply = choices[0].get('message', {})
    model_usage = (data.get('bot_usage') or {}).get('model_usage') or [{}]
    total_tokens = model_usage[0].get('total_tokens')
    return ai_reply, total_tokens
def get_image_to_text(content):
    """Send one user message with ``content`` to model ``doubao-seed-2-0-lite-260215``.

    Args:
        content: Value placed in the ``content`` field of the single user message.
            Callers in this project pass a list of ``image_url`` parts followed by a
            ``text`` instruction.

    Returns:
        ``(ai_reply, total_tokens)``: the first choice's ``message`` (``{}`` when the
        response carries no choices) and ``usage.total_tokens`` (``None`` when absent).

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        ValueError: when the response body is not valid JSON.
    """
    url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
    payload = json.dumps({
        "model": "doubao-seed-2-0-lite-260215",
        "messages": [
            {
                "content": content,
                "role": "user"
            }
        ]
    })
    headers = {
        'Authorization': api_key,
        'Content-Type': 'application/json'
    }
    # Bound the wait so a stalled upstream cannot block the caller indefinitely.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=300)
    data = response.json()
    # An error response may have no "choices"; return an empty reply instead of
    # failing with IndexError/TypeError.
    choices = data.get('choices') or [{}]
    ai_reply = choices[0].get('message', {})
    total_tokens = (data.get('usage') or {}).get('total_tokens')
    return ai_reply, total_tokens
if __name__ == '__main__':
    # Manual smoke test: send one image URL plus a text instruction to
    # get_image_to_text and print the (reply, total_tokens) tuple it returns.
    content = [
        {"type": "image_url","image_url": {"url": "https://douchacha-web.tos-cn-beijing.volces.com/assets/6583982362302876935/cover.jpg"}},
        # {"type": "image_url","image_url": {"url": "https://douchacha-web.tos-cn-beijing.volces.com/assets/7283099688222379304/1.jpg"}},
        {"type": "text", "text": "提取图片中的文字"},
    ]
    print(get_image_to_text(content))
    # list = []
    # for i in range(10):
    # list.append(f"https://douchacha-web.tos-cn-beijing.volces.com/assets/7283099688222379304/{i}.jpg")
    # print(list)
import time
import json
import requests
import functools
from typing import Optional, Dict, Any
# Configuration class
class Config:
    """Static settings used by ``SpeechRecognizer`` and the ``retry`` decorator."""
    # NOTE(review): APP_ID and TOKEN are credentials hard-coded in source control —
    # consider loading them from the environment or a secrets store.
    APP_ID = '6523591376'
    TOKEN = '8jkXUl2u90wL1drhXMfvs65Cq2JMVhwT'
    CLUSTER = 'volc_auc_common'
    # Base URL; "/submit" and "/query" are appended by SpeechRecognizer.
    SERVICE_URL = 'https://openspeech.bytedance.com/api/v1/auc'
    # Attempt count handed to @retry on the SpeechRecognizer methods.
    MAX_RETRY_COUNT = 50
    # Upper bound on status polls performed by SpeechRecognizer.recognize_audio.
    MAX_TASK_ATTEMPTS = 500
# Retry decorator
def retry(max_attempts: int, delay: int = 5):
    """Build a decorator that re-invokes the wrapped callable whenever it raises.

    Args:
        max_attempts: Maximum number of calls to make. With a value <= 0 the wrapped
            callable is never invoked and the wrapper returns ``None``.
        delay: Seconds to sleep between two consecutive attempts (no sleep after the
            last one).

    Returns:
        A decorator. The wrapper returns the callable's result from the first attempt
        that does not raise, or ``None`` once every attempt has raised; the exception
        is never propagated to the caller.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    log = logging.getLogger(__name__)

    def decorator(func):
        func_name = getattr(func, "__name__", repr(func))

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as exc:
                    # Record the failure so exhausted retries are diagnosable; the
                    # "return None on failure" contract is deliberately kept.
                    log.warning("%s failed (attempt %d/%d): %r",
                                func_name, attempt, max_attempts, exc)
                    if attempt < max_attempts:
                        time.sleep(delay)
            return None

        return wrapper

    return decorator
class SpeechRecognizer:
    """Client for the speech-recognition HTTP service configured in ``Config``.

    A task is submitted for an audio URL and then polled until it reports a final
    status code.
    """

    # Seconds to wait for each HTTP call. A timed-out call raises and is then retried
    # by the ``retry`` decorator like any other failure, instead of hanging forever.
    REQUEST_TIMEOUT = 30
    # Seconds to sleep between two status polls.
    POLL_INTERVAL = 5

    def __init__(self):
        # NOTE(review): the "Bearer; <token>" form (with a semicolon) is kept exactly
        # as written — presumably what the service expects; confirm against its docs.
        self.headers = {
            'Authorization': f'Bearer; {Config.TOKEN}',
            'Content-Type': 'application/json'
        }

    @retry(Config.MAX_RETRY_COUNT)
    def _submit_task(self, audio_url: str) -> Optional[str]:
        """Submit a recognition task for ``audio_url`` (internal).

        Returns:
            The task id from ``resp.id``, or ``None`` when every retry attempt raised.
        """
        request_data = {
            "app": {
                "appid": Config.APP_ID,
                "token": Config.TOKEN,
                "cluster": Config.CLUSTER
            },
            "user": {
                "uid": "dcc_live"
            },
            "audio": {
                "format": "mp3",
                "url": audio_url
            },
            "additions": {
                'with_speaker_info': 'False',
            }
        }
        response = requests.post(
            f"{Config.SERVICE_URL}/submit",
            data=json.dumps(request_data),
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json()['resp']['id']

    @retry(Config.MAX_RETRY_COUNT)
    def _query_task(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Fetch the current state of task ``task_id`` (internal).

        Returns:
            The decoded JSON response, or ``None`` when every retry attempt raised.
        """
        query_data = {
            'appid': Config.APP_ID,
            'token': Config.TOKEN,
            'id': task_id,
            'cluster': Config.CLUSTER
        }
        response = requests.post(
            f"{Config.SERVICE_URL}/query",
            data=json.dumps(query_data),
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    @retry(Config.MAX_RETRY_COUNT)
    def recognize_audio(self, audio_url: str) -> Optional[str]:
        """Submit ``audio_url`` for recognition and poll until a final status.

        Status handling, as implemented below: a ``resp.code`` >= 2000 means "poll
        again after a pause", 1000 means success, any other value is a failure.

        Returns:
            ``resp.text`` on success (presumably the transcript string — confirm);
            ``None`` when submission or polling fails, the service reports a failure
            code, or ``Config.MAX_TASK_ATTEMPTS`` polls pass without a final status.
        """
        task_id = self._submit_task(audio_url)
        if not task_id:
            return None
        for _ in range(Config.MAX_TASK_ATTEMPTS):
            result = self._query_task(task_id)
            if not result:
                return None
            # A response without resp/code raises KeyError here; the surrounding
            # @retry then restarts the whole submit-and-poll sequence.
            code = result['resp']['code']
            if code >= 2000:
                time.sleep(self.POLL_INTERVAL)
                continue
            if code == 1000:
                return result.get('resp').get('text')
            return None
        return None
#
# if __name__ == "__main__":
# recognizer = SpeechRecognizer()
# audio_url = "https://douchacha-web.tos-cn-beijing.volces.com/assets/7531557698396048640/7531557698396048640.mp4" # 替换为实际音频URL
# result = recognizer.recognize_audio(audio_url)
# print(result)
\ No newline at end of file
from flask import Flask, request
import os, sys
# Put the parent of this file's directory on sys.path so the project-local imports
# that follow can be resolved.
base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(base_path)
from ai_chat import parse_utils, sys_content
from ApiResponse import handle_exceptions
from ai_utils import get_bot_with_history, get_ai_chat_history, get_bot_with_history_rag
from parse_utils import get_last_user_value, convert_dialog_format
# Flask application object on which the routes below are registered.
app = Flask(__name__)
@app.route('/api/chat', methods=['POST'])
@handle_exceptions
def get_chat_with_history():
    """Classify the latest user message and dispatch it to the matching handler.

    The JSON body is the dialog history (a list of ``{"user": ...}`` /
    ``{"assistant": ...}`` turns). The latest user turn is sent to the model together
    with the ``action_content`` routing prompt, and the reply selects the branch:

    * ``"1"`` - extract text from the link(s) in the latest user message;
    * ``"2"`` - rewrite copy with the ``douyin_text_fix`` prompt;
    * ``"3"`` - answer through the knowledge-base bot;
    * anything else - answer through the web-search bot.

    Returns:
        The handler's result; ``handle_exceptions`` wraps it in the API envelope.

    Raises:
        ValueError: when the body contains no user message. ``handle_exceptions``
            turns it into an error envelope with a readable message.
    """
    data = request.get_json()
    last_user = get_last_user_value(data) if data else None
    if not last_user:
        # Otherwise the list concatenation below fails with an opaque TypeError.
        raise ValueError("请求中缺少用户消息")

    routing_prompt = [{"role": "system", "content": sys_content.action_content}]
    ai_reply, _ = get_ai_chat_history(routing_prompt + last_user)
    # Tolerate surrounding whitespace/newlines and a missing content field in the
    # classifier's reply.
    action = (ai_reply.get('content') or '').strip()

    if action == '1':
        return parse_utils.extract_mp4_to_text(last_user[0].get('content'))

    if action == '2':
        system_prompt, ask = sys_content.douyin_text_fix, get_ai_chat_history
    elif action == '3':
        system_prompt, ask = sys_content.helpful_content_text_rag, get_bot_with_history_rag
    else:
        system_prompt, ask = sys_content.helpful_content_text, get_bot_with_history

    history = [{"role": "system", "content": system_prompt}] + convert_dialog_format(data)
    ai_reply, _ = ask(history)
    return ai_reply
if __name__ == '__main__':
    # NOTE(review): debug=True turns on Flask's debugger/reloader while host 0.0.0.0
    # listens on every interface. Flask's documentation warns against running debug
    # mode anywhere reachable by untrusted clients — confirm this entry point is used
    # for local development only.
    app.run(host='0.0.0.0', port=8088, debug=True)
    # print(get_chat_with_history([{'user': '提取文案https://www.douyin.com/video/7525861419037855018'}]))
import re
from typing import Dict, List, Callable
import ai_utils
import spider_api
import asr_utils
import tos_utlis
import json
class PlatformParser:
    """Substring-based platform detectors.

    Each method returns a descriptor dict when the platform keyword occurs anywhere in
    ``link`` and ``None`` otherwise.
    """

    @staticmethod
    def parse_douyin(link: str) -> Dict:
        """Detect Douyin links (keyword ``douyin``)."""
        if "douyin" not in link:
            return None
        return dict(is_dy=True, type="video", url=link)

    @staticmethod
    def parse_xiaohongshu(link: str) -> Dict:
        """Detect Xiaohongshu links (keyword ``xiaohongshu``)."""
        if "xiaohongshu" not in link:
            return None
        return dict(is_xhs=True, type="video", url=link)

    @staticmethod
    def parse_bzhan(link: str) -> Dict:
        """Detect Bilibili links (keyword ``bilibili``)."""
        if "bilibili" not in link:
            return None
        return dict(is_bili=True, type="video", url=link)

    @staticmethod
    def parse_ks(link: str) -> Dict:
        """Detect Kuaishou links (keyword ``kuaishou``)."""
        if "kuaishou" not in link:
            return None
        return dict(is_ks=True, type="video", url=link)
def parse_url(link: str) -> Dict:
    """Identify the platform of ``link``.

    The detectors are tried in the order dy, xhs, bili, ks; the first one that reports
    a match wins.

    Returns:
        ``{"platform", "type", "url"}``; platform and type are ``"unknown"`` when no
        detector matches.
    """
    detectors = (
        ("dy", PlatformParser.parse_douyin),
        ("xhs", PlatformParser.parse_xiaohongshu),
        ("bili", PlatformParser.parse_bzhan),
        ("ks", PlatformParser.parse_ks),
    )
    for platform, detect in detectors:
        hit = detect(link)
        if hit and hit.get(f"is_{platform}"):
            return {
                "platform": platform,
                "type": hit.get("type", "unknown"),
                "url": hit.get("url", link),
            }
    return {"platform": "unknown", "type": "unknown", "url": link}
# platform -> content type -> processor callable taking (url, parsed_data).
# The lambdas are placeholders that only print; the @register_processor-decorated
# functions further down this module replace the "video" entry of every platform
# listed here when the module is imported.
PROCESSOR_REGISTRY: Dict[str, Dict[str, Callable]] = {
    "dy": {
        "video": lambda url, data: print(f"处理dy视频: {url}")
    },
    "xhs": {
        "video": lambda url, data: print(f"处理xhs视频: {url}")
    },
    "bili": {
        "video": lambda url, data: print(f"处理bili视频: {url}")
    },
    "ks": {
        "video": lambda url, data: print(f"处理ks视频: {url}")
    }
}
def register_processor(platform: str, content_type: str):
    """Return a decorator that stores the decorated function in ``PROCESSOR_REGISTRY``.

    The function is stored under ``[platform][content_type]`` (replacing any previous
    entry) and handed back unchanged.
    """
    def decorator(func):
        # setdefault creates the per-platform table on first registration.
        PROCESSOR_REGISTRY.setdefault(platform, {})[content_type] = func
        return func

    return decorator
############################################################
# 抖音
@register_processor("dy", "video")
def process_douyin_video(url: str, data: Dict):
    """Extract the text of a Douyin post: image text or speech transcript.

    Args:
        url: Link to the post.
        data: Parsed-link descriptor passed by the dispatcher. It is not used; the
            post metadata is fetched via ``spider_api.get_dy_ocr_info``.

    Returns:
        A dict with keys ``platform``, ``type``, ``base_info`` and ``text``, or
        whatever ``tos_utlis.get_string_from_tos`` returns when a cached result exists.
    """
    def reply(kind, base_info, text):
        # Every branch answers with this same four-key shape.
        return {"platform": "dy", "type": kind, "base_info": base_info, "text": text}

    def notice(kind, base_info, content):
        return reply(kind, base_info, {"content": content, "role": "assistant"})

    info = spider_api.get_dy_ocr_info(url)
    if not info:
        # "other" matches the invalid-link replies of the other platform processors.
        return notice("other", url, "链接有误")

    video_id = info.get('video_id')
    asset_dir = f"assets/{video_id}"
    cache_key = f"{asset_dir}/{video_id}.json"
    public_base = f"https://douchacha-web.tos-cn-beijing.volces.com/{asset_dir}"

    # Cache lookup: one probe only. A missing cache entry is the normal case for a
    # new link, so polling for it with the default retries would only delay the reply.
    if tos_utlis.check_file_in_tos(cache_key, retry_times=0):
        return tos_utlis.get_string_from_tos(cache_key)

    if info.get('aweme_type') == 68:
        # aweme_type 68 is handled as an image post: send every image to the model.
        image_count = info.get('image_length') or 0
        if image_count <= 0:
            return notice("image", info, "图文提取失败请稍后重试")
        parts = []
        for i in range(image_count):
            # Media files keep the default polling of check_file_in_tos (presumably
            # they are uploaded asynchronously by the spider service — confirm).
            if not tos_utlis.check_file_in_tos(f"{asset_dir}/{i}.jpg"):
                return notice("image", info, "图文提取失败请稍后重试")
            parts.append({
                "type": "image_url",
                "image_url": {"url": f"{public_base}/{i}.jpg"}
            })
        parts.append({"type": "text", "text": "提取图片中的文字仅返回图中文字不要返回"})
        ai_reply, _ = ai_utils.get_image_to_text(parts)
        result = reply("image", info, ai_reply)
        if ai_reply:
            # Only successful extractions are cached, so a failure can be retried.
            tos_utlis.put_string_to_tos(cache_key, result)
        return result

    # Video post. The duration is divided by 1000 and 60, i.e. treated as milliseconds.
    duration_ms = info.get("duration") or 0
    if duration_ms / 1000 / 60 > 20:
        return notice("other", info, "该视频时长超出20min暂不支持提取")
    if not tos_utlis.check_file_in_tos(f"{asset_dir}/{video_id}.mp4"):
        return notice("video", info, "视频提取失败请稍后重试")

    transcript = asr_utils.SpeechRecognizer().recognize_audio(f"{public_base}/{video_id}.mp4")
    result = notice("video", info, transcript)
    if transcript:
        # Do not cache a failed (None/empty) transcription.
        tos_utlis.put_string_to_tos(cache_key, result)
    return result
############################################################
# 小红书
@register_processor("xhs", "video")
def process_xhs_video(url: str, data: Dict):
    """Extract the text of a Xiaohongshu note: speech transcript or image text.

    Args:
        url: Link to the note.
        data: Parsed-link descriptor passed by the dispatcher. It is not used; the
            note metadata is fetched via ``spider_api.get_xhs_info``.

    Returns:
        A dict with keys ``platform``, ``type``, ``base_info`` and ``text``, or
        whatever ``tos_utlis.get_string_from_tos`` returns when a cached result exists.
    """
    def reply(kind, base_info, text):
        # Every branch answers with this same four-key shape.
        return {"platform": "xhs", "type": kind, "base_info": base_info, "text": text}

    def notice(kind, base_info, content):
        return reply(kind, base_info, {"content": content, "role": "assistant"})

    info = spider_api.get_xhs_info(url)
    if not info:
        return notice("other", url, "链接有误")

    note_key = f"{info.get('note_id')}_{info.get('xsec_token')}"
    asset_dir = f"xhs/{note_key}"
    cache_key = f"{asset_dir}/{note_key}.json"
    public_base = f"https://douchacha-web.tos-cn-beijing.volces.com/{asset_dir}"

    # Cache lookup: one probe only. A missing cache entry is the normal case for a
    # new link, so polling for it with the default retries would only delay the reply.
    if tos_utlis.check_file_in_tos(cache_key, retry_times=0):
        return tos_utlis.get_string_from_tos(cache_key)

    if info.get('type') == 'video':
        # The duration is divided by 1000 and 60, i.e. treated as milliseconds.
        duration_ms = info.get("duration") or 0
        if duration_ms / 1000 / 60 > 20:
            return notice("other", info, "该视频时长超出20min暂不支持提取")
        # Media files keep the default polling of check_file_in_tos (presumably they
        # are uploaded asynchronously by the spider service — confirm).
        if not tos_utlis.check_file_in_tos(f"{asset_dir}/{note_key}.mp4"):
            return notice("video", info, "视频提取失败请稍后重试")
        transcript = asr_utils.SpeechRecognizer().recognize_audio(f"{public_base}/{note_key}.mp4")
        result = notice("video", info, transcript)
        if transcript:
            # Do not cache a failed (None/empty) transcription.
            tos_utlis.put_string_to_tos(cache_key, result)
        return result

    # Any other note type is handled as an image note.
    image_count = info.get('image_length') or 0
    if image_count <= 0:
        return notice("image", info, "图片提取失败请稍后重试")
    parts = []
    for i in range(image_count):
        if not tos_utlis.check_file_in_tos(f"{asset_dir}/{i}.jpg"):
            return notice("image", info, "图片提取失败请稍后重试")
        parts.append({
            "type": "image_url",
            "image_url": {"url": f"{public_base}/{i}.jpg"}
        })
    parts.append({"type": "text", "text": "提取图片中的文字仅返回图中文字"})
    ai_reply, _ = ai_utils.get_image_to_text(parts)
    result = reply("image", info, ai_reply)
    if ai_reply:
        # Only successful extractions are cached, so a failure can be retried.
        tos_utlis.put_string_to_tos(cache_key, result)
    return result
############################################################
"""
快手
"""
@register_processor("ks", "video")
def process_ks_video(url: str, data: Dict):
    """Extract the speech transcript of a Kuaishou video.

    Args:
        url: Link to the video.
        data: Parsed-link descriptor passed by the dispatcher. It is not used; the
            video metadata is fetched via ``spider_api.get_ks_info``.

    Returns:
        A dict with keys ``platform``, ``type``, ``base_info`` and ``text``, or
        whatever ``tos_utlis.get_string_from_tos`` returns when a cached result exists.
    """
    def notice(kind, base_info, content):
        # Every branch answers with this same shape.
        return {
            "platform": "ks",
            "type": kind,
            "base_info": base_info,
            "text": {"content": content, "role": "assistant"},
        }

    info = spider_api.get_ks_info(url)
    if not info:
        return notice("other", url, "链接有误")

    video_id = info.get('video_id')
    asset_dir = f"ks/{video_id}"
    cache_key = f"{asset_dir}/{video_id}.json"
    media_key = f"{asset_dir}/{video_id}.mp4"

    # Cache lookup: one probe only. A missing cache entry is the normal case for a
    # new link, so polling for it with the default retries would only delay the reply.
    if tos_utlis.check_file_in_tos(cache_key, retry_times=0):
        return tos_utlis.get_string_from_tos(cache_key)

    # The duration is divided by 1000 and 60, i.e. treated as milliseconds.
    duration_ms = info.get("duration") or 0
    if duration_ms / 1000 / 60 > 20:
        return notice("other", info, "该视频时长超出20min暂不支持提取")

    # The media file keeps the default polling of check_file_in_tos (presumably it is
    # uploaded asynchronously by the spider service — confirm).
    if not tos_utlis.check_file_in_tos(media_key):
        return notice("video", info, "视频下载失败请稍后重试")

    transcript = asr_utils.SpeechRecognizer().recognize_audio(
        f"https://douchacha-web.tos-cn-beijing.volces.com/{media_key}")
    result = notice("video", info, transcript)
    if transcript:
        # Do not cache a failed (None/empty) transcription.
        tos_utlis.put_string_to_tos(cache_key, result)
    return result
############################################################
# b站
@register_processor("bili", "video")
def process_bzhan_video(url: str, data: Dict):
    """Extract the speech transcript of a Bilibili video.

    Args:
        url: Link to the video.
        data: Parsed-link descriptor passed by the dispatcher. It is not used; the
            video metadata is fetched via ``spider_api.get_bilibili_info``.

    Returns:
        A dict with keys ``platform``, ``type``, ``base_info`` and ``text``, or
        whatever ``tos_utlis.get_string_from_tos`` returns when a cached result exists.
    """
    def notice(kind, base_info, content):
        # Every branch answers with this same shape.
        return {
            "platform": "bili",
            "type": kind,
            "base_info": base_info,
            "text": {"content": content, "role": "assistant"},
        }

    info = spider_api.get_bilibili_info(url)
    if not info:
        return notice("other", url, "链接有误")

    bvid = info.get('bvid')
    asset_dir = f"bilibili/{bvid}"
    cache_key = f"{asset_dir}/{bvid}.json"
    media_key = f"{asset_dir}/{bvid}.mp3"

    # Cache lookup: one probe only. A missing cache entry is the normal case for a
    # new link, so polling for it with the default retries would only delay the reply.
    if tos_utlis.check_file_in_tos(cache_key, retry_times=0):
        return tos_utlis.get_string_from_tos(cache_key)

    # The duration is divided by 60 only, i.e. treated as seconds; the limit for this
    # platform is 50 minutes and the message states the same figure.
    duration_s = info.get("duration") or 0
    if duration_s / 60 > 50:
        return notice("other", info, "该视频时长超出50min暂不支持提取")

    # The media file keeps the default polling of check_file_in_tos (presumably it is
    # uploaded asynchronously by the spider service — confirm).
    if not tos_utlis.check_file_in_tos(media_key):
        return notice("video", info, "视频下载失败请稍后重试")

    transcript = asr_utils.SpeechRecognizer().recognize_audio(
        f"https://douchacha-web.tos-cn-beijing.volces.com/{media_key}")
    result = notice("video", info, transcript)
    if transcript:
        # Do not cache a failed (None/empty) transcription.
        tos_utlis.put_string_to_tos(cache_key, result)
    return result
############################################################
def process_urls(urls: List[str]):
    """Run every URL through the processor registered for its platform and type.

    Returns:
        One entry per input URL, in order: the processor's result, or the string
        ``"无法处理的链接: <url>"`` when no processor is registered.
    """
    def handle(url):
        parsed = parse_url(url)
        handler = PROCESSOR_REGISTRY.get(parsed["platform"], {}).get(parsed["type"])
        if not handler:
            return f"无法处理的链接: {url}"
        return handler(url, parsed)

    return [handle(url) for url in urls]
def extract_mp4_to_text(text: str):
    """Find every http(s) URL in ``text`` and process each with ``process_urls``.

    Args:
        text: Free-form user text that may contain one or more links.

    Returns:
        The list produced by ``process_urls``; an empty list when ``text`` is empty,
        ``None`` or contains no URL.
    """
    if not text:
        return []
    url_pattern = re.compile(r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]')
    return process_urls(url_pattern.findall(text))
def convert_dialog_format(original_dialog: list) -> list:
    """Convert ``{"user": ...}`` / ``{"assistant": ...}`` turns to role/content messages.

    A turn containing both keys is treated as a user turn; turns with neither key are
    dropped. Order is preserved.
    """
    def to_message(turn):
        # 'user' is checked first, so it wins when both keys are present.
        for role in ('user', 'assistant'):
            if role in turn:
                return {'role': role, 'content': turn[role]}
        return None

    candidates = (to_message(turn) for turn in original_dialog)
    return [message for message in candidates if message is not None]
# Get the latest user message
def get_last_user_value(dialogs: list) -> "list | None":
    """Return the last ``user`` turn of ``dialogs`` as a one-element message list.

    Args:
        dialogs: Dialog history, a list of ``{"user": ...}`` / ``{"assistant": ...}``
            turns. ``None`` or an empty list is accepted.

    Returns:
        ``[{"role": "user", "content": <text>}]`` for the last turn that has a
        ``user`` key, or ``None`` when there is no such turn or its value is ``None``.
    """
    if not dialogs:
        return None
    # Walk backwards: the first user turn found is the most recent one.
    for item in reversed(list(dialogs)):
        if 'user' in item:
            content = item['user']
            if content is None:
                return None
            return [{'role': 'user', 'content': content}]
    return None
def transform_image_urls(url_list: List[str]) -> List[Dict]:
    """Wrap each URL in an ``image_url`` content part, preserving order."""
    parts = []
    for image_url in url_list:
        parts.append({"type": "image_url", "image_url": {"url": image_url}})
    return parts
import requests
import urllib.parse
import re
from urllib.parse import quote_plus
"""
快手
"""
def is_valid_kuaishou_profile(url: str) -> bool:
    """Return True when ``url`` matches ``https://www.kuaishou.com/f/<id>``.

    ``<id>`` is 10 to 20 characters of digits, ASCII letters, ``_`` or ``-``.
    """
    # Verbose-mode pattern; its text (including the inline comments) is kept as-is.
    kuaishou_link = r'''
^https:// # 必须以https协议开头
www\.kuaishou\.com # 固定域名
/f/ # 固定路径前缀
[0-9a-zA-Z_-]{10,20} # 用户ID:10-20位数字、字母、下划线或连字符
$ # 字符串结束
'''
    return re.match(kuaishou_link, url, re.VERBOSE) is not None
def get_ks_info(video_url):
    """Fetch Kuaishou metadata for ``video_url`` from the internal spider service.

    Returns:
        The decoded JSON response, or ``None`` (without any request) when
        ``video_url`` fails ``is_valid_kuaishou_profile``.
    """
    if not is_valid_kuaishou_profile(video_url):
        return None
    endpoint = f"http://172.16.18.10:8875/api/v1/ks?share_url={quote_plus(video_url)}"
    response = requests.request("GET", endpoint, headers={}, data={})
    return response.json()
"""
小红书
"""
def is_valid_xiaohongshu_link(url: str) -> bool:
    """Return True for a Xiaohongshu note link that carries an ``xsec_token`` parameter.

    Accepted shape: ``https://www.xiaohongshu.com/(discovery|explore)/[item/]<24 hex>?``
    followed by a query string in which ``xsec_token=...`` appears.
    """
    note_link = (
        r'^https://www\.xiaohongshu\.com/(?:discovery|explore)/'
        r'(?:item/)?[0-9a-fA-F]{24}\?'
        r'(?:[^&]+&)*?'
        r'xsec_token=[^&]+'
        r'(?:&[^&]+)*?$'
    )
    return re.match(note_link, url) is not None
def get_xhs_info(video_url):
    """Fetch Xiaohongshu metadata for ``video_url`` from the internal spider service.

    Returns:
        The decoded JSON response, or ``None`` (without any request) when
        ``video_url`` fails ``is_valid_xiaohongshu_link``.
    """
    if not is_valid_xiaohongshu_link(video_url):
        return None
    endpoint = f"http://172.16.18.10:8875/api/v1/xhs?share_url={quote_plus(video_url)}"
    response = requests.request("GET", endpoint, headers={}, data={})
    return response.json()
"""
抖音图文
"""
def get_dy_ocr_info(video_url):
    """Fetch Douyin post metadata for ``video_url`` from the internal spider service.

    Two link shapes are recognised:

    * links containing a numeric id (``douyin.com/video/<id>``, ``modal_id=<id>``, ...),
      queried with ``video_id=<id>``;
    * ``v.douyin.com`` short share links, queried with ``video_share_url=<link>``.
      When both shapes match, the share-link form is used.

    Returns:
        The decoded JSON response, or ``None`` (without any request) when neither
        shape matches.
    """
    url = ""
    id_match = re.search(
        r'(?:douyin\.com/(?:video|share/video|note|jingxuan)/|video_id|modal_id=)(\d+)',
        video_url)
    if id_match:
        url = f"http://172.16.18.10:8875/api/v1/ocr?video_id={id_match.group(1)}"
    if re.search(r'https?://v\.douyin\.com/[a-zA-Z0-9-]+/?', video_url):
        # URL-encode the link so characters such as '&', '#' or '?' inside it cannot
        # break the query string; the other spider helpers encode their link too.
        url = f"http://172.16.18.10:8875/api/v1/ocr?video_share_url={quote_plus(video_url)}"
    if url == "":
        return None
    response = requests.request("GET", url, headers={}, data={})
    return response.json()
def is_valid_bilibili_video(url: str) -> bool:
    """Return True when ``url`` matches ``https://www.bilibili.com/video/BV<id>[/]?<query>``.

    ``<id>`` is 10 to 30 ASCII letters or digits, and a non-empty query string is
    required.
    """
    # Verbose-mode pattern; its text (including the inline comments) is kept as-is.
    bilibili_link = r'''
^https:// # 必须以https协议开头
www\.bilibili\.com # 固定域名
/video/ # 固定路径
BV[0-9a-zA-Z]{10,30} # BV号(放宽长度限制为10-15位)
/?\? # 允许视频ID后有一个可选的斜杠,再必须跟问号
.+ # 问号后至少有一个参数
$ # 字符串结束
'''
    return re.match(bilibili_link, url, re.VERBOSE) is not None
def get_bilibili_info(video_url):
    """Fetch Bilibili metadata for ``video_url`` from the internal spider service.

    Returns:
        The decoded JSON response, or ``None`` (without any request) when
        ``video_url`` fails ``is_valid_bilibili_video``.
    """
    if not is_valid_bilibili_video(video_url):
        return None
    endpoint = f"http://172.16.18.10:8875/api/v1/bilibili?share_url={quote_plus(video_url)}"
    response = requests.request("GET", endpoint, headers={}, data={})
    return response.json()
if __name__ == '__main__':
    # Manual check: query the spider service for one sample Douyin link and print
    # the response.
    # url= "https://v.douyin.com/--n8QjSuvh4/"
    url= "https://www.douyin.com/jingxuan?modal_id=7584179438045121842"
    print(get_dy_ocr_info(url))
\ No newline at end of file
# Routing prompt used as the system message of the first model call in the /api/chat
# handler. The model is asked to answer with a bare "1", "2" or "3", which the handler
# compares against to choose between text extraction, copy rewriting and the
# knowledge-base bot; any other reply goes to the general bot. The prompt text is
# runtime data — edit it only together with that dispatch logic.
action_content = """
判断其与文案提取或文案改写的相关性,并给出相应回复。
1: 判断输入内容并回复
1.1. 与文案提取相关时并且域名中包含抖音快手小红书b站的域名时,必须包含一个url,返回1。
1.2. 与文案改写相关时 返回2 。
1.3 与dso抖音搜索相关时返回3
- 仅回复1,2,3不提供其他额外信息
"""
douyin_text_fix = """
# 角色
你是一位专业的抖音短视频脚本改写专家,能够在遵循原文逻辑的基础上,对给定的抖音短视频脚本进行改写。要以自然、简洁的口语化方式表达,模仿真人说话的语气,坚决不使用排比句以及复杂的词汇。同时,务必保留人物名、时间、地点、数字、政策名称等核心细节,保证原文关键信息不改变。
## 技能
### 技能 1: 抖音短视频脚本改写
1. 接收用户提供的抖音短视频脚本后,认真研读脚本内容,精准把握原文逻辑。
2. 逐句对脚本进行改写,使用简单易懂、贴近生活的口语化表述,摒弃排比句和复杂生僻的词汇。
3. 完成改写后,仔细核对,确保所有核心细节及关键信息与原文一致。
## 限制
- 仅围绕抖音短视频脚本改写提供服务,不回答与脚本改写无关的问题。
- 改写后的脚本必须严格遵循原文逻辑,核心细节和关键信息不得有误。
- 表述要口语化、自然、简洁,杜绝使用排比句和复杂词汇。
"""
xhs_text_fix="""
# 角色
你是一个小红书短视频脚本改写小能手,负责将给定的小红书短视频脚本进行改写。要做到与原文逻辑保持一致,用特别自然、简洁的口语化表达,就像平时真人聊天说话那种语气,千万不要用排比句,也别用复杂的词汇。一定要把人物名、时间、地点、数字、政策名称这些核心细节都保留好,保证原文关键信息不会变。
## 技能
### 技能 1: 小红书短视频脚本改写
1. 拿到用户给的小红书短视频脚本后,好好看看里面写了啥,搞清楚原文的逻辑。
2. 一句一句地改写脚本,用那种简单好懂的口语化说法,别整排比句,也别用那些复杂的词。
3. 改写完后检查检查,保证核心细节和关键信息跟原文一模一样。
## 限制
- 只做小红书短视频脚本改写这件事,别的跟改写脚本没关系的问题一概不回答。
- 改写后的脚本逻辑必须和原文一致,核心细节和关键信息不能有差错。
- 表达得口语化、自然、简洁,不能出现排比句和复杂词汇。
"""
action_content_text = """
# 角色
你是一个文案改写判断助手,能够判断用户提供的文案更倾向于是抖音文案改写需求还是小红书文案改写需求。
## 技能
### 技能 1: 判断文案改写类型
1. 接收用户提供的需要改写的文案。
2. 分析文案风格、语言特点、目标受众倾向等,判断其更符合抖音文案改写需求还是小红书文案改写需求。
3. 直接回复“抖音文案改写回复1,小红书文案改写回复2 无法判断默认是1
## 限制
- 仅回复1,2不提供其他额外信息
"""
helpful_content_text_rag = """
# 角色
你是爱搜灵犀北京爱查查开发的智能助手,你将根据用户的各种咨询需求,依据既定知识和逻辑,一步步为用户提供准确且有用的回答。
# 任务描述与要求
1. 对于用户提出的问题,需深入理解其意图,从多维度分析并给出全面的回答。
2. 回答要简洁明了,避免使用过于复杂的语言和句式,确保用户能轻松理解。
3. 若涉及专业知识,要在解释清楚的基础上,结合通俗易懂的例子辅助说明。
# 相关限制
1. 回答需基于准确的知识,不能随意编造信息。
2. 避免给出模棱两可、没有明确结论的回答。
3. 语言风格要保持友好、专业,不能使用不当或冒犯性的词汇。
不知道答案时返回 我还不太清楚你要表达什么意思
"""
helpful_content_text = """
# 角色
你是爱搜灵犀北京爱查查开发的智能助手,你将根据用户的各种咨询需求,依据既定知识和逻辑,一步步为用户提供准确且有用的回答。
# 任务描述与要求
1. 对于用户提出的问题,需深入理解其意图,从多维度分析并给出全面的回答。
2. 回答要简洁明了,避免使用过于复杂的语言和句式,确保用户能轻松理解。
3. 若涉及专业知识,要在解释清楚的基础上,结合通俗易懂的例子辅助说明。
# 相关限制
1. 回答需基于准确的知识,不能随意编造信息。
2. 避免给出模棱两可、没有明确结论的回答。
3. 语言风格要保持友好、专业,不能使用不当或冒犯性的词汇。
不知道答案时返回 我还不太清楚你要表达什么意思
## 限制 当用户问题是一个url 并且与文案提取相关时 回答链接格式错误 并且返回正确url链接样式
"""
# -*- coding: utf-8 -*-
import time
import tos
from io import StringIO
import json
# Connection settings for the TOS object-storage client used by the helpers below.
# NOTE(review): the access key and secret key are hard-coded in source control —
# consider loading them from the environment or a secrets store, and rotating them.
TOS_AK = "AKLTMmRiMmU3YmY5ZjZjNDZkMTlhMmQxY2JkYTllYTQzNDI"
TOS_SK = "WkdWak4yUTRNakl3WVdOa05HUXdaR0V4TlRBM1l6YzJZMll5WkRnMFlUTQ=="
TOS_ENDPOINT = "tos-cn-beijing.volces.com"
TOS_REGION = "cn-beijing"
# Bucket that every helper in this module reads from and writes to.
TOS_BUCKET_NAME = "douchacha-web"
def check_file_in_tos(object_key, retry_times=50):
    """Poll the bucket until ``object_key`` exists.

    Args:
        object_key: Key of the object inside ``TOS_BUCKET_NAME``.
        retry_times: Number of additional attempts after the first one. Attempts are
            5 seconds apart.

    Returns:
        ``True`` as soon as a HEAD request for the object succeeds; ``False`` once all
        ``retry_times + 1`` attempts have failed.
    """
    attempts = max(retry_times, 0) + 1
    for attempt in range(attempts):
        try:
            client = tos.TosClientV2(TOS_AK, TOS_SK, TOS_ENDPOINT, TOS_REGION, max_retry_count=3)
            client.head_object(bucket=TOS_BUCKET_NAME, key=object_key)
            return True
        except Exception as exc:
            # Every failed attempt counts, whatever the error. Counting only 404
            # responses would make a persistent network or credential error loop
            # forever.
            not_found = isinstance(exc, tos.exceptions.TosServerError) and exc.status_code == 404
            if not not_found:
                print(f"check_file_in_tos error for {object_key}: {exc}")
            if attempt < attempts - 1:
                time.sleep(5)
    return False
def put_string_to_tos( object_key, content, retry_times=3):
    """Serialise ``content`` to JSON and upload it as ``object_key``.

    Up to ``retry_times`` attempts are made; each failure is printed.

    Returns:
        ``True`` after the first successful upload, ``False`` when every attempt
        failed.
    """
    client = tos.TosClientV2(TOS_AK, TOS_SK, TOS_ENDPOINT, TOS_REGION, max_retry_count=3)
    for _ in range(retry_times):
        try:
            client.put_object(TOS_BUCKET_NAME, object_key, content=json.dumps(content))
        except Exception as exc:
            print(exc)
        else:
            return True
    return False
def get_string_from_tos( object_key, retry_times=3):
    """Download ``object_key`` and decode its body as JSON.

    Up to ``retry_times`` attempts are made.

    Returns:
        The decoded JSON value; ``None`` when the server answers 404 for the object;
        ``False`` when every attempt failed for another reason.
    """
    client = tos.TosClientV2(TOS_AK, TOS_SK, TOS_ENDPOINT, TOS_REGION, max_retry_count=3)
    for _ in range(retry_times):
        try:
            raw = client.get_object(TOS_BUCKET_NAME, object_key).read()
            return json.loads(raw)
        except tos.exceptions.TosServerError as server_error:
            # A missing object is final; other server errors fall through to a retry.
            if server_error.status_code == 404:
                print(f"文件不存在: {object_key}")
                return None
        except (tos.exceptions.TosClientError, json.JSONDecodeError):
            print("JSON解析失败")
    return False
import json
import os
from functools import wraps
from loguru import logger
from flask import jsonify
# Directory that receives the log files; created at import time if it is missing.
LOG_DIR = "logs"
os.makedirs(LOG_DIR, exist_ok=True)
# Remove the handlers registered so far so only the two file sinks below receive records.
logger.remove()
# INFO-and-above sink: one file per day (rotation at 00:00), files kept for 7 days.
logger.add(
    os.path.join(LOG_DIR, "info_{time:YYYY-MM-DD}.log"),
    level="INFO",
    rotation="00:00",
    retention="7 days",
    encoding="utf-8",
    enqueue=True,
    backtrace=False
)
# ERROR-level sink: rotated daily, files kept for 7 days.
# NOTE(review): diagnose=True makes tracebacks include variable values — confirm that is
# acceptable for the data this service handles.
logger.add(
    os.path.join(LOG_DIR, "err_{time:YYYY-MM-DD}.log"),
    level="ERROR",
    rotation="00:00",
    retention="7 days",
    encoding="utf-8",
    enqueue=True,
    backtrace=True,
    diagnose=True
)
class ApiResponse:
    """Builds the JSON envelope (``code`` / ``data`` / ``msg``) returned by the API.

    The ``code`` field travels in the JSON body only; no HTTP status is passed to
    ``jsonify`` by either method.
    """

    @staticmethod
    def success(data=None, msg="success"):
        """Log the payload at INFO level and return a code-200 envelope around ``data``."""
        envelope = dict(code=200, data=data, msg=msg)
        logger.info(f"dso-ai-bot: {msg}, 数据: {data}")
        return jsonify(envelope)

    @staticmethod
    def error(msg="服务器内部错误", code=500):
        """Log the failure at ERROR level and return an envelope whose ``data`` is None."""
        envelope = dict(code=code, data=None, msg=msg)
        logger.error(f"dso-ai-bot: {code}, exception: {msg}")
        return jsonify(envelope)
def handle_exceptions(func):
    """Decorator: wrap ``func``'s return value in ``ApiResponse.success``.

    Any exception raised by ``func`` (or while building the success response) is logged
    with its traceback and converted into ``ApiResponse.error(str(exc), 500)``.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            # Building the success response stays inside the try so that a payload
            # that cannot be serialised is reported as an error envelope as well.
            return ApiResponse.success(func(*args, **kwargs))
        except Exception as exc:
            logger.exception(f"function {func.__name__} exception")
            return ApiResponse.error(str(exc), 500)

    return wrapper
import requests
import json
# Endpoint used by the plain chat and image helpers.
CHAT_COMPLETIONS_URL = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
# Endpoint used by the bot (web-search / knowledge-base) helpers.
BOTS_CHAT_COMPLETIONS_URL = "https://ark.cn-beijing.volces.com/api/v3/bots/chat/completions"
# Value of the Authorization header sent with every request.
# NOTE(review): this credential is hard-coded in source control — consider loading it
# from the environment or a secrets store, and rotating the key.
API_KEY = "Bearer fcc424e5-58af-494d-9683-5787413a26c9"
# Default number of buffered content characters that triggers an SSE content event.
DEFAULT_BATCH_SIZE = 50
def _streaming_request(url, model, messages, type, batch_size=50, content_event_type=1):
    """POST a streaming chat request and re-emit the answer as SSE ``data:`` events.

    Each emitted event is ``data: {"message": ..., "type": ...}`` followed by a blank
    line. Event types produced here:

    * ``0`` - request or processing error (``message`` holds the error text);
    * ``4`` - a ``reasoning_content`` delta, forwarded immediately;
    * ``content_event_type`` - answer text, flushed whenever at least ``batch_size``
      characters are buffered, plus one final flush of the remainder.

    Args:
        url: Endpoint to POST to.
        model: Model or bot identifier placed in the request body.
        messages: Message list forwarded unchanged.
        type: Value of the request's ``thinking.type`` field. (The name shadows the
            builtin but is part of the signature used by callers.)
        batch_size: Minimum buffered content length that triggers a content event.
        content_event_type: ``type`` value used for answer-text events.

    Returns:
        A generator of SSE-formatted strings. When the HTTP request itself fails, the
        generator yields a single type-0 event.
    """
    payload = {
        "model": model,
        "stream": True,
        "messages": messages,
        "thinking": {
            "type": type
        }
    }
    headers = {
        'Authorization': API_KEY,
        'Content-Type': 'application/json'
    }
    try:
        response = requests.post(
            url,
            json=payload,
            headers=headers,
            stream=True,
            timeout=300
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Release the connection of a response that arrived with an error status.
        if getattr(e, "response", None) is not None:
            e.response.close()
        err_msg = f"请求失败: {str(e)}"

        def error_generator():
            error_data = json.dumps({"message": err_msg, "type": 0}, ensure_ascii=False)
            yield f"data: {error_data}\n\n"

        return error_generator()

    def generate_sse():
        buffer = ""
        try:
            for line in response.iter_lines():
                if not line:
                    continue
                try:
                    line_str = line.decode('utf-8').strip()
                    # Remove the SSE field name as a prefix. str.lstrip('data: ')
                    # would strip a *set of characters*, not the prefix.
                    if line_str.startswith('data:'):
                        line_str = line_str[len('data:'):].strip()
                    if line_str == '[DONE]':
                        break
                    chunk = json.loads(line_str)
                    if isinstance(chunk, dict) and "choices" in chunk:
                        choice = chunk["choices"][0] if chunk["choices"] else None
                        if choice and "delta" in choice:
                            delta = choice["delta"]
                            reasoning_content = delta.get("reasoning_content")
                            if reasoning_content:
                                reasoning_data = json.dumps({
                                    "message": reasoning_content,
                                    "type": 4
                                }, ensure_ascii=False)
                                yield f"data: {reasoning_data}\n\n"
                            content = delta.get("content")
                            if content:
                                buffer += content
                                if len(buffer) >= batch_size:
                                    data = json.dumps({
                                        "message": buffer,
                                        "type": content_event_type
                                    }, ensure_ascii=False)
                                    yield f"data: {data}\n\n"
                                    buffer = ""
                except json.JSONDecodeError:
                    # Lines that are not JSON (comments, keep-alives) are skipped.
                    continue
                except Exception as e:
                    error_data = json.dumps({
                        "message": f"处理数据错误: {str(e)}",
                        "type": 0
                    }, ensure_ascii=False)
                    yield f"data: {error_data}\n\n"
                    break
            if buffer:
                data = json.dumps({"message": buffer, "type": content_event_type},
                                  ensure_ascii=False)
                yield f"data: {data}\n\n"
        finally:
            # Return the connection even when the consumer stops iterating early.
            response.close()

    return generate_sse()


def _streaming_request_comment(url, model, messages, type, batch_size=50):
    """Same as ``_streaming_request`` but answer-text events carry ``type`` 6."""
    return _streaming_request(url, model, messages, type,
                              batch_size=batch_size, content_event_type=6)
# Standard chat
def get_ai_chat_history_stream(messages, type="disabled", batch_size=DEFAULT_BATCH_SIZE):
    """Stream a reply from model ``ep-20250804181425-5blp4`` via the chat endpoint.

    Returns the SSE generator produced by ``_streaming_request``.
    """
    request_args = dict(
        url=CHAT_COMPLETIONS_URL,
        model="ep-20250804181425-5blp4",
        messages=messages,
        type=type,
        batch_size=batch_size,
    )
    return _streaming_request(**request_args)
# Web-search chat
def get_bot_with_history_stream(messages, type="auto", batch_size=DEFAULT_BATCH_SIZE):
    """Stream a reply from bot ``bot-20250730164611-nbsng`` via the bots endpoint.

    Returns the SSE generator produced by ``_streaming_request``.
    """
    request_args = dict(
        url=BOTS_CHAT_COMPLETIONS_URL,
        model="bot-20250730164611-nbsng",
        messages=messages,
        type=type,
        batch_size=batch_size,
    )
    return _streaming_request(**request_args)
# Knowledge base
def get_bot_with_history_rag_stream(messages, type="auto", batch_size=DEFAULT_BATCH_SIZE):
    """Stream a reply from bot ``bot-20250725180737-mjxbx`` via the bots endpoint.

    Returns the SSE generator produced by ``_streaming_request``.
    """
    request_args = dict(
        url=BOTS_CHAT_COMPLETIONS_URL,
        model="bot-20250725180737-mjxbx",
        messages=messages,
        type=type,
        batch_size=batch_size,
    )
    return _streaming_request(**request_args)
# Image recognition
def get_image_to_text_stream(content, type="auto", batch_size=DEFAULT_BATCH_SIZE):
    """Stream a reply from model ``doubao-seed-2-0-lite-260215`` for one user message.

    ``content`` becomes the ``content`` of the single user message. Returns the SSE
    generator produced by ``_streaming_request``.
    """
    user_message = {"content": content, "role": "user"}
    return _streaming_request(
        url=CHAT_COMPLETIONS_URL,
        model="doubao-seed-2-0-lite-260215",
        messages=[user_message],
        type=type,
        batch_size=batch_size,
    )
def get_ai_chat_history(messages, stream=False):
    """POST ``messages`` to the chat-completions endpoint using model ``ep-20241214232538-x27zp``.

    Args:
        messages: Message list forwarded unchanged as the ``messages`` field
            (presumably role/content dicts — confirm against callers).
        stream: Forwarded as the ``stream`` field. The response is always parsed as
            one JSON document, so only ``False`` is meaningful here.

    Returns:
        ``(ai_reply, total_tokens)``: the first choice's ``message`` (``{}`` when the
        response carries no choices) and ``usage.total_tokens`` (``None`` when absent).

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
        ValueError: when the response body is not valid JSON.
    """
    url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
    payload = json.dumps({
        "model": "ep-20241214232538-x27zp",
        "stream": stream,
        "messages": messages
    })
    headers = {
        'Authorization': API_KEY,
        'Content-Type': 'application/json'
    }
    # Same bound as the streaming helpers, so a stalled upstream cannot block forever.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=300)
    data = response.json()
    # An error response may have no "choices" (or an empty/null one); fall back to an
    # empty reply instead of failing with IndexError/TypeError.
    choices = data.get('choices') or [{}]
    ai_reply = choices[0].get('message', {})
    total_tokens = (data.get('usage') or {}).get('total_tokens')
    return ai_reply, total_tokens
def get_ai_chat_history_stream_comment(messages, type = "disabled", batch_size=DEFAULT_BATCH_SIZE):
    """Start a streaming chat request through ``_streaming_request_comment``.

    Uses the same URL and model id as ``get_ai_chat_history_stream``; all
    arguments are forwarded unchanged.
    """
    request_options = {
        "url": CHAT_COMPLETIONS_URL,
        "model": "ep-20250804181425-5blp4",
        "messages": messages,
        "type": type,
        "batch_size": batch_size,
    }
    return _streaming_request_comment(**request_options)
if __name__ == '__main__':
    # Manual smoke test. The request helpers take a list of role/content
    # messages and return an iterable of chunks, so build a proper message
    # and print each chunk instead of printing the iterable object itself.
    demo_messages = [{"role": "user", "content": "文案改写"}]
    for chunk in get_ai_chat_history_stream_comment(demo_messages):
        print(chunk)
import time
import json
import requests
import functools
from typing import Optional, Dict, Any
# Configuration
class Config:
    """Static settings used by the speech-recognition client in this module."""

    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from the environment or a secrets store.
    APP_ID: str = "6523591376"
    TOKEN: str = "8jkXUl2u90wL1drhXMfvs65Cq2JMVhwT"
    CLUSTER: str = "volc_auc_common"
    SERVICE_URL: str = "https://openspeech.bytedance.com/api/v1/auc"
    # Attempt count passed to the `retry` decorator.
    MAX_RETRY_COUNT: int = 1000
    # Upper bound on result polls in `SpeechRecognizer.recognize_audio`.
    MAX_TASK_ATTEMPTS: int = 500
# Retry decorator
def retry(max_attempts: int, delay: int = 5):
    """Build a decorator that retries the wrapped callable on any exception.

    Args:
        max_attempts: Maximum number of calls to make.
        delay: Seconds to sleep between two attempts (not after the last).

    Returns:
        A decorator. The wrapped callable returns the first successful
        result, or ``None`` once every attempt has raised (also when
        ``max_attempts`` is not positive). Failures are logged instead of
        being dropped silently.
    """
    import logging  # local import keeps this block self-contained

    log = logging.getLogger(__name__)

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except Exception as exc:
                    log.warning(
                        "%s failed on attempt %d/%d: %r",
                        func.__name__, attempt, max_attempts, exc,
                    )
                    if attempt < max_attempts:
                        time.sleep(delay)
            # Callers treat None as "gave up".
            return None
        return wrapper
    return decorator
class SpeechRecognizer:
    """Client for the speech-recognition service configured in ``Config``.

    A task is submitted for an audio URL and then polled until the service
    reports a final state. Every method is wrapped in ``retry``, so an
    exception leads to a retry and, once retries run out, a ``None`` result.
    """

    # Seconds allowed per HTTP call. Without a timeout a stalled connection
    # would hang the caller; with it the request raises and `retry` re-runs.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # The "Bearer; <token>" spelling is kept exactly as the original
        # sent it.
        self.headers = {
            'Authorization': f'Bearer; {Config.TOKEN}',
            'Content-Type': 'application/json'
        }

    @retry(Config.MAX_RETRY_COUNT)
    def _submit_task(self, audio_url: str) -> Optional[str]:
        """Submit a speech-recognition task (internal method).

        Returns the task id from ``resp.id`` of the JSON response. HTTP
        errors and unexpected payloads raise and are handled by ``retry``.
        """
        request_data = {
            "app": {
                "appid": Config.APP_ID,
                "token": Config.TOKEN,
                "cluster": Config.CLUSTER
            },
            "user": {
                "uid": "dcc_live"
            },
            "audio": {
                "format": "mp3",
                "url": audio_url
            },
            "additions": {
                'with_speaker_info': 'False',
            }
        }
        response = requests.post(
            f"{Config.SERVICE_URL}/submit",
            data=json.dumps(request_data),
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json()['resp']['id']

    @retry(Config.MAX_RETRY_COUNT)
    def _query_task(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Query the state of a submitted task and return the parsed JSON."""
        query_data = {
            'appid': Config.APP_ID,
            'token': Config.TOKEN,
            'id': task_id,
            'cluster': Config.CLUSTER
        }
        response = requests.post(
            f"{Config.SERVICE_URL}/query",
            data=json.dumps(query_data),
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    # NOTE(review): this retry wraps methods that already retry themselves,
    # so an exception raised here (e.g. a missing key) re-submits the task.
    @retry(Config.MAX_RETRY_COUNT)
    def recognize_audio(self, audio_url: str) -> Optional[str]:
        """Run recognition for ``audio_url`` and return ``resp.text``.

        Polls at most ``Config.MAX_TASK_ATTEMPTS`` times, one second apart.
        Returns ``None`` when submission or a query fails, when the service
        reports a code other than 1000 below 2000, or when polling runs out.
        Callers use the result as text (presumably a ``str`` - confirm
        against the service response).
        """
        task_id = self._submit_task(audio_url)
        if not task_id:
            return None
        for _ in range(Config.MAX_TASK_ATTEMPTS):
            result = self._query_task(task_id)
            if not result:
                return None
            code = result['resp']['code']
            if code >= 2000:
                # Not finished yet: wait and poll again.
                time.sleep(1)
                continue
            if code == 1000:
                return result.get('resp').get('text')
            return None
        return None
if __name__ == "__main__":
    # Manual check: recognise one sample file and print the result.
    # Replace the URL below with a real audio URL.
    sample_url = "https://douchacha-web.tos-cn-beijing.volces.com/assets/7552163083708321058/7552163083708321058.mp4"
    print(SpeechRecognizer().recognize_audio(sample_url))
\ No newline at end of file
from flask import request
import os, sys
base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(base_path)
from ai_chat_stream import parse_utils, sys_content,parse_comment_utils
from ai_utils import get_bot_with_history_stream, get_ai_chat_history, get_bot_with_history_rag_stream, \
get_ai_chat_history_stream
from parse_utils import get_last_user_value, convert_dialog_format, create_res_data,extract_user_contents
from flask import Flask, Response
import json
app = Flask(__name__)
def sse_generator(data):
    """Yield the SSE byte chunks for one chat request.

    The last user message is first sent to the model with the
    ``sys_content.action_content`` prompt. The reply content selects the
    handler: ``'1'`` link parsing, ``'2'`` copy rewriting, ``'3'``
    knowledge-base lookup, ``'4'`` comment retrieval, anything else web
    search.

    Args:
        data: The request dialog, as accepted by ``convert_dialog_format``
            and ``get_last_user_value``.
    """
    def sse(message, type_):
        # One SSE "data:" frame carrying a create_res_data payload.
        payload = json.dumps(create_res_data(message, type_), ensure_ascii=False)
        return f"data: {payload}\n\n".encode('utf-8')

    yield sse("正在识别用户问题", 3)
    classifier_prompt = [{"role": "system", "content": sys_content.action_content}]
    # get_last_user_value returns None when the dialog has no user turn;
    # concatenating None to a list would raise TypeError.
    last_user_turn = get_last_user_value(data) or []
    ai_reply, _ = get_ai_chat_history(classifier_prompt + last_user_turn)
    # Tolerate surrounding whitespace/newlines in the classifier answer.
    action = str((ai_reply or {}).get('content') or '').strip()

    if action == '1':
        yield sse("正在解析链接", 3)
        user_text = '拼接'.join(extract_user_contents(convert_dialog_format(data)))
        yield from parse_utils.extract_mp4_to_text(user_text)
    elif action == '2':
        yield sse("正在文案改写", 3)
        prompt = [{"role": "system", "content": sys_content.douyin_text_fix}]
        stream = get_ai_chat_history_stream(prompt + convert_dialog_format(data))
        if stream:
            yield from stream
        yield sse("WRITE", 10)
    elif action == '3':
        yield sse("正在查询爱搜知识库", 3)
        prompt = [{"role": "system", "content": sys_content.helpful_content_text_rag}]
        stream = get_bot_with_history_rag_stream(prompt + convert_dialog_format(data))
        if stream:
            yield from stream
    elif action == '4':
        yield sse("正在获取评论", 3)
        user_text = '拼接'.join(extract_user_contents(convert_dialog_format(data)))
        yield from parse_comment_utils.extract_comment_to_text(user_text)
        yield sse("WRITE,VIDEO", 10)
    else:
        yield sse("正在联网搜索", 3)
        prompt = [{"role": "system", "content": sys_content.helpful_content_text}]
        stream = get_bot_with_history_stream(prompt + convert_dialog_format(data))
        if stream:
            yield from stream
@app.route('/api/chat', methods=['POST'])
def sse_stream():
    """Handle ``POST /api/chat`` by streaming ``sse_generator`` output.

    The parsed JSON body is handed to ``sse_generator`` and returned as a
    ``text/event-stream`` response with caching and proxy buffering
    disabled through the response headers.
    """
    stream_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no"
    }
    event_stream = sse_generator(request.get_json())
    return Response(event_stream, mimetype="text/event-stream", headers=stream_headers)
if __name__ == '__main__':
    # Script entry point: serve the app on every interface, port 8087.
    app.run(port=8087, host='0.0.0.0')
import re
from typing import Dict, List, Callable
import ai_utils
import spider_api
import asr_utils
import tos_utlis
import json
from ai_chat_stream import sys_content
def parse_url(link: str) -> Dict:
    """Classify ``link`` by the platform marker it contains.

    Markers are tested as plain substrings in a fixed order (douyin,
    xiaohongshu/xhslink, bilibili, kuaishou). The result always carries
    ``platform``, ``type`` and the unchanged ``url``; links without a known
    marker get ``"unknown"`` for both platform and type.
    """
    platform_markers = (
        ("dy", ("douyin",)),
        ("xhs", ("xiaohongshu", "xhslink")),
        ("bili", ("bilibili",)),
        ("ks", ("kuaishou",)),
    )
    for platform, markers in platform_markers:
        if any(marker in link for marker in markers):
            return {"platform": platform, "type": "video", "url": link}
    return {"platform": "unknown", "type": "unknown", "url": link}
############################################################
# Douyin
def process_douyin_video(url: str, data: Dict):
    """Yield SSE chunks with a Douyin post's metadata and a comment digest.

    The post is fetched with ``spider_api.get_dy_ocr_info``. A metadata
    card (type 5) is emitted, then the post's comments are sent to the
    chat model and the streamed chunks are forwarded. Any exception is
    reported as a single type-0 event.

    Args:
        url: Link of the post.
        data: Ignored; the value is replaced by the spider result.
    """
    def sse(message, type_):
        # One SSE "data:" frame carrying a create_res_data payload.
        payload = json.dumps(create_res_data(message, type_), ensure_ascii=False)
        return f"data: {payload}\n\n".encode('utf-8')

    try:
        info = spider_api.get_dy_ocr_info(url)
        if info is None or len(info) < 3:  # check for None first, then the length
            yield sse("链接有误", 1)
            return

        video_id = info.get("video_id")
        is_image_post = info.get('aweme_type') == 68
        if is_image_post:
            aweme_type, cover_file = "TUWEN", "0.jpg"
            download_images = get_dy_download_images(int(video_id), info.get("image_length"))
        else:
            aweme_type, cover_file = "SHIPIN", "cover.jpg"
            download_images = str(info.get("download_images"))

        # Key order is part of the emitted JSON; keep it stable.
        card = {
            "aweme_type": aweme_type,
            "collect_count": str(info.get('collect_count')),
            "comment_count": str(info.get('comment_count')),
            "cover": f'https://static2.douchacha.com/assets/{video_id}/{cover_file}',
            "create_time": str(info.get('create_time')),
            "digg_count": str(info.get('digg_count')),
            "download_count": str(info.get('download_count')),
            "download_images": download_images,
            "download_video_url": str(info.get("download_video_url")),
            "duration": str(info.get("duration")),
            "image_length": str(info.get("image_length")),
            "item_title": str(info.get("item_title")),
            "recommend_count": str(info.get("recommend_count")),
            "share_count": str(info.get("share_count")),
            "title": str(info.get("title")),
            "video_id": str(video_id),
            "url": url,
            "platform": "DY"
        }
        yield sse(card, 5)

        prompt = [{"role": "system", "content": sys_content.comment_content}]
        dialog = comment_process(info.get("comments"))
        summary_stream = ai_utils.get_ai_chat_history_stream_comment(
            prompt + convert_dialog_format(dialog), type="disabled")
        if summary_stream:
            yield from summary_stream
        elif not is_image_post:
            # Only the video branch reports a missing stream.
            yield sse("视频评论获取失败", 0)
    except Exception as e:
        yield sse(f"处理失败: {str(e)}", 0)
############################################################
# Xiaohongshu
def process_xhs_video(url: str, data: Dict):
    """Yield SSE chunks with a Xiaohongshu note's metadata and a comment digest.

    The note is fetched with ``spider_api.get_xhs_info``. A metadata card
    (type 5) is emitted, then the note's comments are sent to the chat
    model and the streamed chunks are forwarded. Any exception is reported
    as a single type-1 event.

    Args:
        url: Link of the note.
        data: Ignored; the value is replaced by the spider result.
    """
    def sse(message, type_):
        # One SSE "data:" frame carrying a create_res_data payload.
        payload = json.dumps(create_res_data(message, type_), ensure_ascii=False)
        return f"data: {payload}\n\n".encode('utf-8')

    try:
        info = spider_api.get_xhs_info(url)
        if info is None or len(info) < 3:  # check for None first, then the length
            yield sse("链接有误", 1)
            return

        note_id = info.get('note_id')
        if info.get('type') == 'video':
            aweme_type, cover_file = "SHIPIN", "cover.jpg"
            download_images = info.get("download_images", [])
        else:
            aweme_type, cover_file = "TUWEN", "0.jpg"
            download_images = get_xhs_download_images(f"{note_id}", info.get("image_length"))

        # Key order is part of the emitted JSON; keep it stable.
        card = {
            "aweme_type": aweme_type,
            "collect_count": str(info.get('collected_count')),
            "title": str(info.get("desc")),
            "item_title": str(info.get("desc")),
            "duration": str(info.get("duration")),
            "create_time": str(info.get('time')),
            "comment_count": str(info.get('comment_count')),
            "digg_count": str(info.get('liked_count')),
            "image_length": str(info.get("image_length")),
            "share_count": str(info.get("share_count")),
            "video_id": str(note_id),
            # Fields that still need to be added by the spider
            "cover": f"https://static2.douchacha.com/xhs/{note_id}/{cover_file}",
            "download_count": info.get('download_count', ''),
            "download_images": download_images,
            "download_video_url": info.get("download_video_url", ''),
            "recommend_count": info.get("recommend_count", ''),
            "url": url,
            "platform": "XHS"
        }
        yield sse(card, 5)

        prompt = [{"role": "system", "content": sys_content.comment_content}]
        dialog = comment_process(info.get("comments"))
        summary_stream = ai_utils.get_ai_chat_history_stream_comment(
            prompt + convert_dialog_format(dialog), type="disabled")
        if summary_stream:
            yield from summary_stream
    except Exception:
        # The exception text is deliberately not exposed for this platform.
        yield sse("暂不支持", 1)
############################################################
# Kuaishou
def process_ks_video(url: str, data: Dict):
    """Yield SSE chunks with a Kuaishou video's metadata and a comment digest.

    The video is fetched with ``spider_api.get_ks_info``. A metadata card
    (type 5) is emitted, then the video's comments are sent to the chat
    model and the streamed chunks are forwarded. Any exception is reported
    as a single type-0 event.

    Args:
        url: Link of the video.
        data: Ignored; the value is replaced by the spider result.
    """
    def sse(message, type_):
        # One SSE "data:" frame carrying a create_res_data payload.
        payload = json.dumps(create_res_data(message, type_), ensure_ascii=False)
        return f"data: {payload}\n\n".encode('utf-8')

    try:
        info = spider_api.get_ks_info(url)
        if info is None or len(info) < 3:  # check for None first, then the length
            yield sse("链接有误", 1)
            return

        video_id = info.get('video_id')
        caption = str(info.get("caption"))
        # Key order is part of the emitted JSON; keep it stable.
        card = {
            "aweme_type": "SHIPIN",
            "title": caption,
            "item_title": caption,
            "cover": f"https://static2.douchacha.com/ks/{video_id}/cover.jpg",
            "duration": str(info.get("duration")),
            "digg_count": str(info.get('like_count')),
            "video_id": str(video_id),
            # Fields that still need to be added by the spider
            "create_time": str(info.get('time', '')),
            "comment_count": str(info.get('comment_count', '')),
            "image_length": str(info.get("image_length", '')),
            "share_count": str(info.get("share_count", '')),
            "collect_count": str(info.get('collected_count', '')),
            "download_count": str(info.get('download_count', '')),
            "download_images": str(info.get("download_images", [])),
            "download_video_url": str(info.get("download_video_url", '')),
            "recommend_count": str(info.get("recommend_count", '')),
            "url": url,
            "platform": "KS"
        }
        yield sse(card, 5)

        prompt = [{"role": "system", "content": sys_content.comment_content}]
        dialog = comment_process(info.get("comments"))
        summary_stream = ai_utils.get_ai_chat_history_stream_comment(
            prompt + convert_dialog_format(dialog), type="disabled")
        if summary_stream:
            yield from summary_stream
    except Exception as e:
        yield sse(f"处理失败: {str(e)}", 0)
############################################################
# Bilibili
def process_bzhan_video(url: str, data: Dict):
    """Yield SSE chunks with a Bilibili video's metadata and a comment digest.

    The video is fetched with ``spider_api.get_bilibili_info``. A metadata
    card (type 5) is emitted, then the video's comments are sent to the
    chat model and the streamed chunks are forwarded. Any exception is
    reported as a single type-0 event.

    Args:
        url: Link of the video.
        data: Ignored; the value is replaced by the spider result.
    """
    def sse(message, type_):
        # One SSE "data:" frame carrying a create_res_data payload.
        payload = json.dumps(create_res_data(message, type_), ensure_ascii=False)
        return f"data: {payload}\n\n".encode('utf-8')

    try:
        info = spider_api.get_bilibili_info(url)
        if info is None or len(info) < 3:  # check for None first, then the length
            yield sse("链接有误", 1)
            return

        bvid = info.get('bvid')
        # Key order is part of the emitted JSON; keep it stable.
        card = {
            "aweme_type": "SHIPIN",
            "title": str(info.get("title")),
            "item_title": str(info.get("title")),
            "cover": f"https://static2.douchacha.com/bilibili/{bvid}/cover.jpg",
            "duration": str(info.get("duration")*1000),
            # NOTE(review): other count fields here read "share"/"reply";
            # confirm the spider really provides "like_count".
            "digg_count": str(info.get('like_count')),
            # The identifier is the bvid (the same value the cover path is
            # built from), not the "like" field that was read before.
            "video_id": str(bvid),
            # Fields that still need to be added by the spider
            "create_time": str(info.get('pubdate', '')),
            "comment_count": str(info.get('comment_count', '')),
            "image_length": str(info.get("image_length", '')),
            "share_count": str(info.get("share", '')),
            "collect_count": str(info.get('collected_count', '')),
            "download_count": str(info.get('download_count', '')),
            "download_images": str(info.get("download_images", [])),
            "download_video_url": str(info.get("download_video_url", '')),
            "recommend_count": str(info.get("reply", '')),
            "platform": "BL",
            "url": url,
        }
        yield sse(card, 5)

        prompt = [{"role": "system", "content": sys_content.comment_content}]
        dialog = comment_process(info.get("comments"))
        summary_stream = ai_utils.get_ai_chat_history_stream_comment(
            prompt + convert_dialog_format(dialog), type="disabled")
        if summary_stream:
            yield from summary_stream
    except Exception as e:
        yield sse(f"处理失败: {str(e)}", 0)
############################################################
def process_urls(urls: List[str]):
    """Dispatch the last URL in ``urls`` to its platform handler.

    Yields whatever the selected handler yields. When ``urls`` is empty,
    the platform is not recognised, or dispatching raises, a single type-1
    event asking for a video or image-text link is yielded. Previously an
    unrecognised platform produced no output at all.
    """
    def invalid_link_event():
        payload = json.dumps(create_res_data("请提供视频或者图文链接", 1), ensure_ascii=False)
        return f"data: {payload}\n\n".encode('utf-8')

    try:
        url = urls[-1]
        parsed_data = parse_url(url)
        handlers = {
            "dy": process_douyin_video,
            "xhs": process_xhs_video,
            "bili": process_bzhan_video,
            "ks": process_ks_video,
        }
        handler = handlers.get(parsed_data["platform"])
        if handler is None or parsed_data["type"] != "video":
            yield invalid_link_event()
            return
        yield from handler(url, parsed_data)
    except Exception:
        yield invalid_link_event()
def extract_comment_to_text(text: str):
    """Find every http(s) URL in ``text`` and yield ``process_urls`` output for them."""
    found_links = re.findall(
        r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]',
        text,
    )
    yield from process_urls(found_links)
def convert_dialog_format(original_dialog: list) -> list:
    """Convert ``{'user': ...}`` / ``{'assistant': ...}`` items to role/content dicts.

    Each item becomes ``{'role': <key>, 'content': <value>}``. ``'user'``
    wins when an item holds both keys; items with neither key are dropped.
    """
    messages = []
    for turn in original_dialog:
        role = next((name for name in ('user', 'assistant') if name in turn), None)
        if role is not None:
            messages.append({'role': role, 'content': turn[role]})
    return messages
def get_last_user_value(dialogs: list) -> "List[Dict] | None":
    """Return the last user turn as a one-element message list.

    The previous annotation (``str or None``) evaluated to ``str`` and did
    not match what is returned.

    Returns:
        ``[{'role': 'user', 'content': <value>}]`` for the last item that
        has a ``'user'`` key, or ``None`` when there is no such item or its
        value is ``None``.
    """
    user_values = [item['user'] for item in dialogs if 'user' in item]
    if not user_values or user_values[-1] is None:
        return None
    return [{'role': 'user', 'content': user_values[-1]}]
def transform_image_urls(url_list: List[str]) -> List[Dict]:
    """Wrap each URL as ``{"type": "image_url", "image_url": {"url": <url>}}``."""
    return [
        dict(type="image_url", image_url=dict(url=link))
        for link in url_list
    ]
def create_res_data(message, type_):
    """Build the response payload dict with keys ``message`` and ``type``."""
    return dict(message=message, type=type_)
def get_dy_download_images(video_id, image_length):
    """List the image entries of a Douyin post.

    Returns one ``{"image_index": i, "image_url": ...}`` dict for every
    ``i`` in ``range(image_length)``, pointing at
    ``assets/<video_id>/<i>.jpg`` on the static host.
    """
    base = f"https://static2.douchacha.com/assets/{video_id}"
    return [
        {"image_index": index, "image_url": f"{base}/{index}.jpg"}
        for index in range(image_length)
    ]
def get_xhs_download_images(video_id, image_length):
    """List the image entries of a Xiaohongshu note.

    Returns one ``{"image_index": i, "image_url": ...}`` dict for every
    ``i`` in ``range(image_length)``, pointing at ``xhs/<video_id>/<i>.jpg``
    on the static host.
    """
    base = f"https://static2.douchacha.com/xhs/{video_id}"
    return [
        {"image_index": index, "image_url": f"{base}/{index}.jpg"}
        for index in range(image_length)
    ]
def comment_process(comments):
    """Join the ``text`` of every comment into one user dialog entry.

    Args:
        comments: Iterable of dicts with an optional ``text`` key, or
            ``None``.

    Returns:
        ``[{"user": "<text1>, <text2>, ..."}]``. A ``None`` input and
        comments whose ``text`` is missing or ``None`` are skipped; before,
        either case raised TypeError.
    """
    texts = [
        str(entry.get('text'))
        for entry in (comments or [])
        if entry.get('text') is not None
    ]
    return [{"user": ", ".join(texts)}]
import re
from typing import Dict, List, Callable
import ai_utils
import spider_api
import asr_utils
import tos_utlis
import json
from ai_chat_stream import sys_content
def parse_url(link: str) -> Dict:
    """Classify ``link`` by the platform marker it contains.

    Markers are tested as plain substrings in a fixed order (douyin,
    xiaohongshu/xhslink, bilibili, kuaishou). The result always carries
    ``platform``, ``type`` and the unchanged ``url``; links without a known
    marker get ``"unknown"`` for both platform and type.
    """
    ordered_rules = [
        (("douyin",), "dy"),
        (("xiaohongshu", "xhslink"), "xhs"),
        (("bilibili",), "bili"),
        (("kuaishou",), "ks"),
    ]
    matched = next(
        (code for needles, code in ordered_rules
         if any(needle in link for needle in needles)),
        None,
    )
    if matched is None:
        return {"platform": "unknown", "type": "unknown", "url": link}
    return {"platform": matched, "type": "video", "url": link}
############################################################
# Douyin
def process_douyin_video(url: str, data: Dict):
    """Yield SSE chunks with a Douyin post's metadata and its extracted text.

    The post is fetched with ``spider_api.get_dy_ocr_info``.

    * Image posts (``aweme_type == 68``): the images found in TOS are sent
      to ``ai_utils.get_image_to_text_stream`` and its chunks forwarded.
    * Videos: a text cached in TOS is returned when present. Otherwise
      videos longer than 20 minutes are rejected, and the stored mp4 is run
      through ``asr_utils.SpeechRecognizer`` and the result cached.

    Any exception is reported as a single type-0 event.

    Args:
        url: Link of the post.
        data: Ignored; the value is replaced by the spider result.
    """
    def sse(message, type_):
        # One SSE "data:" frame carrying a create_res_data payload.
        payload = json.dumps(create_res_data(message, type_), ensure_ascii=False)
        return f"data: {payload}\n\n".encode('utf-8')

    try:
        info = spider_api.get_dy_ocr_info(url)
        if info is None or len(info) < 3:  # check for None first, then the length
            yield sse("链接有误", 1)
            return
        video_id = info.get('video_id')

        def card(aweme_type, cover_file, download_images):
            # Metadata payload; key order is part of the emitted JSON.
            return {
                "aweme_type": aweme_type,
                "collect_count": str(info.get('collect_count')),
                "comment_count": str(info.get('comment_count')),
                "cover": f'https://static2.douchacha.com/assets/{video_id}/{cover_file}',
                "create_time": str(info.get('create_time')),
                "digg_count": str(info.get('digg_count')),
                "download_count": str(info.get('download_count')),
                "download_images": download_images,
                "download_video_url": str(info.get("download_video_url")),
                "duration": str(info.get("duration")),
                "image_length": str(info.get("image_length")),
                "item_title": str(info.get("item_title")),
                "recommend_count": str(info.get("recommend_count")),
                "share_count": str(info.get("share_count")),
                "title": str(info.get("title")),
                "video_id": str(video_id),
                "url": url,
                "platform": "DY"
            }

        def text_result(text):
            # Final text event plus the type-10 follow-up event. `comments`
            # may be present but None, so do not rely on the .get default.
            has_comments = len(info.get("comments") or []) > 0
            yield sse(text, 1)
            yield sse("WRITE,COMMENT" if has_comments else "WRITE", 10)

        if info.get('aweme_type') == 68:
            image_parts = []
            for i in range(info.get('image_length')):
                if tos_utlis.check_file_in_tos(f"assets/{video_id}/{i}.jpg"):
                    image_parts.append({
                        "type": "image_url",
                        "image_url": {
                            "url": f"https://douchacha-web.tos-cn-beijing.volces.com/assets/{video_id}/{i}.jpg"}
                    })
            images = get_dy_download_images(int(video_id), info.get("image_length"))
            yield sse(card("TUWEN", "0.jpg", images), 5)
            yield sse("图文内容提取中", 3)
            image_parts.append({"type": "text", "text": "提取图片中的文字仅返回图中文字不要返回"})
            yield from ai_utils.get_image_to_text_stream(image_parts)
            return

        cache_key = f"assets/{video_id}/{video_id}_text.json"
        # Check the cache once. The old `if X ... elif not X` asked the
        # store twice and could run neither branch if the answer changed.
        if tos_utlis.check_file_in_tos_buff(cache_key):
            yield sse(card("SHIPIN", "cover.jpg", str(info.get("download_images"))), 5)
            yield sse("视频文案提取中", 3)
            cached_text = tos_utlis.get_string_from_tos(cache_key).get('text')
            # NOTE(review): the cached text is written back unchanged, as
            # before; confirm whether this refresh is intentional.
            if len(cached_text) > 0:
                tos_utlis.put_string_to_tos(cache_key, {'text': cached_text})
            yield from text_result(cached_text)
            return

        if info.get("duration") / 1000 / 60 > 20:
            yield sse("该视频时长超出20min暂不支持提取", 1)
            return
        if not tos_utlis.check_file_in_tos(f"assets/{video_id}/{video_id}.mp4"):
            yield sse("视频提取失败请稍后重试", 0)
            return

        yield sse(card("SHIPIN", "cover.jpg", str(info.get("download_images"))), 5)
        yield sse("视频文案提取中", 3)
        media_url = f"https://douchacha-web.tos-cn-beijing.volces.com/assets/{video_id}/{video_id}.mp4"
        audio_result = asr_utils.SpeechRecognizer().recognize_audio(media_url)
        yield sse("识别完成", 3)
        if audio_result is None:
            yield sse("语音识别失败", 0)
            return
        if len(audio_result) > 0:
            # Cache only non-empty transcripts.
            tos_utlis.put_string_to_tos(cache_key, {'text': audio_result})
        yield from text_result(audio_result if len(audio_result) > 0 else "该视频没有文案")
    except Exception as e:
        yield sse(f"处理失败: {str(e)}", 0)
############################################################
# Xiaohongshu
def process_xhs_video(url: str, data: Dict):
    """Yield SSE chunks with a Xiaohongshu note's metadata and its extracted text.

    The note is fetched with ``spider_api.get_xhs_info``.

    * Video notes: a text cached in TOS is returned when present. Otherwise
      videos longer than 20 minutes are rejected, and the stored mp4 is run
      through ``asr_utils.SpeechRecognizer`` and the result cached.
    * Other notes: the images found in TOS are sent to
      ``ai_utils.get_image_to_text_stream`` and its chunks forwarded.

    Any exception is reported as a single type-1 event.

    Args:
        url: Link of the note.
        data: Ignored; the value is replaced by the spider result.
    """
    def sse(message, type_):
        # One SSE "data:" frame carrying a create_res_data payload.
        payload = json.dumps(create_res_data(message, type_), ensure_ascii=False)
        return f"data: {payload}\n\n".encode('utf-8')

    try:
        info = spider_api.get_xhs_info(url)
        if info is None or len(info) < 3:  # check for None first, then the length
            yield sse("链接有误", 1)
            return
        note_id = info.get('note_id')

        def card(aweme_type, cover_file, download_images):
            # Metadata payload; key order is part of the emitted JSON.
            return {
                "aweme_type": aweme_type,
                "collect_count": str(info.get('collected_count')),
                "title": str(info.get("desc")),
                "item_title": str(info.get("desc")),
                "duration": str(info.get("duration")),
                "create_time": str(info.get('time')),
                "comment_count": str(info.get('comment_count')),
                "digg_count": str(info.get('liked_count')),
                "image_length": str(info.get("image_length")),
                "share_count": str(info.get("share_count")),
                "video_id": str(note_id),
                # Fields that still need to be added by the spider
                "cover": f"https://static2.douchacha.com/xhs/{note_id}/{cover_file}",
                "download_count": info.get('download_count', ''),
                "download_images": download_images,
                "download_video_url": info.get("download_video_url", ''),
                "recommend_count": info.get("recommend_count", ''),
                "url": url,
                "platform": "XHS"
            }

        def text_result(text):
            # Final text event plus the type-10 follow-up event. `comments`
            # may be present but None, so do not rely on the .get default.
            has_comments = len(info.get("comments") or []) > 0
            yield sse(text, 1)
            yield sse("WRITE,COMMENT" if has_comments else "WRITE", 10)

        if info.get('type') == 'video':
            yield sse(card("SHIPIN", "cover.jpg", info.get("download_images", [])), 5)
            cache_key = f"xhs/{note_id}/{note_id}_text.json"
            # Check the cache once. The old `if X ... elif not X` asked the
            # store twice and could run neither branch if the answer changed.
            if tos_utlis.check_file_in_tos_buff(cache_key):
                cached_text = tos_utlis.get_string_from_tos(cache_key).get('text')
                # NOTE(review): the cached text is written back unchanged,
                # as before; confirm whether this refresh is intentional.
                if len(cached_text) > 0:
                    tos_utlis.put_string_to_tos(cache_key, {'text': cached_text})
                yield from text_result(cached_text)
                return

            if info.get("duration") / 1000 / 60 > 20:
                yield sse("该视频时长超出20min暂不支持提取", 1)
                return
            if not tos_utlis.check_file_in_tos(f"xhs/{note_id}/{note_id}.mp4"):
                yield sse("视频提取失败请稍后重试", 0)
                return

            media_url = f"https://douchacha-web.tos-cn-beijing.volces.com/xhs/{note_id}/{note_id}.mp4"
            yield sse("视频文案提取中", 3)
            audio_result = asr_utils.SpeechRecognizer().recognize_audio(media_url)
            if audio_result is None:
                yield sse("语音识别失败", 0)
                return
            if len(audio_result) > 0:
                # Cache only non-empty transcripts.
                tos_utlis.put_string_to_tos(cache_key, {'text': audio_result})
            yield from text_result(audio_result if len(audio_result) > 0 else "该视频没有文案")
            return

        images = get_xhs_download_images(f"{note_id}", info.get("image_length"))
        yield sse(card("TUWEN", "0.jpg", images), 5)
        image_parts = []
        for i in range(info.get('image_length')):
            if tos_utlis.check_file_in_tos(f"xhs/{note_id}/{i}.jpg"):
                image_parts.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"https://douchacha-web.tos-cn-beijing.volces.com/xhs/{note_id}/{i}.jpg"}
                })
        yield sse("图文内容提取中", 3)
        image_parts.append({"type": "text", "text": "提取图片中的文字仅返回图中文字"})
        yield from ai_utils.get_image_to_text_stream(image_parts)
    except Exception:
        # The exception text is deliberately not exposed for this platform.
        yield sse("暂不支持", 1)
############################################################
# 快手
def process_ks_video(url: str, data: Dict):
    """Stream metadata and the transcript of a Kuaishou video as SSE frames.

    Every yielded item is a UTF-8 encoded ``data: <json>`` frame followed by a
    blank line; the JSON is ``create_res_data(message, type)``. This function
    emits: type 5 (metadata dict), type 3 (progress text), type 1 (transcript
    or a user-facing notice), type 10 (``"WRITE"`` / ``"WRITE,COMMENT"``) and
    type 0 (failure text).

    Args:
        url: Share link handed to ``spider_api.get_ks_info``.
        data: Ignored; it is replaced by the crawler response.
    """

    def sse(message, type_):
        # Single place that builds the frame format used by every branch.
        payload = create_res_data(message, type_)
        return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n".encode('utf-8')

    try:
        data = spider_api.get_ks_info(url)
        if data is None or len(data) < 3:
            yield sse("链接有误", 1)
            return

        video_id = data.get('video_id')
        spider_data = {
            "aweme_type": "SHIPIN",
            "title": str(data.get("caption")),
            "item_title": str(data.get("caption")),
            "cover": f"https://static2.douchacha.com/ks/{video_id}/cover.jpg",
            "duration": str(data.get("duration")),
            "digg_count": str(data.get('like_count')),
            "video_id": str(video_id),
            # Fields the crawler still has to provide; default to '' until then.
            "create_time": str(data.get('time', '')),
            "comment_count": str(data.get('comment_count', '')),
            "image_length": str(data.get("image_length", '')),
            "share_count": str(data.get("share_count", '')),
            "collect_count": str(data.get('collected_count', '')),
            "download_count": str(data.get('download_count', '')),
            "download_images": str(data.get("download_images", [])),
            "download_video_url": str(data.get("download_video_url", '')),
            "recommend_count": str(data.get("recommend_count", '')),
            "url": url,
            "platform": "KS"
        }
        yield sse(spider_data, 5)

        # Truthiness also covers a missing or None "comments" entry.
        follow_up = "WRITE,COMMENT" if data.get("comments") else "WRITE"
        buffer_url = f"ks/{video_id}/{video_id}_text.json"

        # Cached transcript: the existence check is done once, and a cache
        # object that cannot be read as a dict is treated as a miss instead of
        # crashing on ``.get`` (get_string_from_tos may return None or False).
        cached_text = None
        if tos_utlis.check_file_in_tos_buff(buffer_url):
            cached = tos_utlis.get_string_from_tos(buffer_url)
            if isinstance(cached, dict):
                cached_text = cached.get('text')
        if cached_text is not None:
            if len(cached_text) > 0:
                tos_utlis.put_string_to_tos(buffer_url, {'text': cached_text})
            yield sse(cached_text, 1)
            yield sse(follow_up, 10)
            return

        # No usable cache: run speech recognition on the stored video file.
        # "duration" is divided by 1000 and 60 before the 20-minute comparison.
        if data.get("duration") / 1000 / 60 > 20:
            yield sse("该视频时长超出20min暂不支持提取", 1)
            return
        if not tos_utlis.check_file_in_tos(f"ks/{video_id}/{video_id}.mp4"):
            yield sse("视频提取失败请稍后重试", 0)
            return

        media_url = f"https://douchacha-web.tos-cn-beijing.volces.com/ks/{video_id}/{video_id}.mp4"
        yield sse("视频文案提取中", 3)
        recognizer = asr_utils.SpeechRecognizer()
        audio_result = recognizer.recognize_audio(media_url)
        if audio_result is None:
            yield sse("语音识别失败", 0)
            return

        if len(audio_result) > 0:
            # Only non-empty transcripts are written back to the cache.
            tos_utlis.put_string_to_tos(buffer_url, {'text': audio_result})
            content = audio_result
        else:
            content = "该视频没有文案"
        yield sse(content, 1)
        yield sse(follow_up, 10)
    except Exception as e:
        yield sse(f"处理失败: {str(e)}", 0)
############################################################
# B站
def process_bzhan_video(url: str, data: Dict):
    """Stream metadata and the transcript of a Bilibili video as SSE frames.

    Every yielded item is a UTF-8 encoded ``data: <json>`` frame followed by a
    blank line; the JSON is ``create_res_data(message, type)``. This function
    emits: type 5 (metadata dict), type 3 (progress text), type 1 (transcript
    or a user-facing notice), type 10 (``"WRITE"`` / ``"WRITE,COMMENT"``) and
    type 0 (failure text).

    Args:
        url: Share link handed to ``spider_api.get_bilibili_info``.
        data: Ignored; it is replaced by the crawler response.
    """

    def sse(message, type_):
        # Single place that builds the frame format used by every branch.
        payload = create_res_data(message, type_)
        return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n".encode('utf-8')

    try:
        data = spider_api.get_bilibili_info(url)
        if data is None or len(data) < 3:
            yield sse("链接有误", 1)
            return

        bvid = data.get('bvid')
        spider_data = {
            "aweme_type": "SHIPIN",
            "title": str(data.get("title")),
            "item_title": str(data.get("title")),
            "cover": f"https://static2.douchacha.com/bilibili/{bvid}/cover.jpg",
            # The crawler value is multiplied by 1000 before being reported.
            "duration": str(data.get("duration") * 1000),
            "digg_count": str(data.get('like_count')),
            # NOTE(review): video_id is read from "like" while every path in
            # this function is keyed by "bvid" - looks like a copy/paste slip;
            # confirm against the crawler schema before changing.
            "video_id": str(data.get("like")),
            # Fields the crawler still has to provide; default to '' until then.
            "create_time": str(data.get('pubdate', '')),
            "comment_count": str(data.get('comment_count', '')),
            "image_length": str(data.get("image_length", '')),
            "share_count": str(data.get("share", '')),
            "collect_count": str(data.get('collected_count', '')),
            "download_count": str(data.get('download_count', '')),
            "download_images": str(data.get("download_images", [])),
            "download_video_url": str(data.get("download_video_url", '')),
            "recommend_count": str(data.get("reply", '')),
            "platform": "BL",
            "url": url,
        }
        yield sse(spider_data, 5)
        yield sse("视频文案提取中", 3)

        # Truthiness also covers a missing or None "comments" entry.
        follow_up = "WRITE,COMMENT" if data.get("comments") else "WRITE"
        buffer_url = f"bilibili/{bvid}/{bvid}.json"

        # Cached transcript: the existence check is done once, and a cache
        # object that cannot be read as a dict is treated as a miss instead of
        # crashing on ``.get`` (get_string_from_tos may return None or False).
        cached_text = None
        if tos_utlis.check_file_in_tos_buff(buffer_url):
            cached = tos_utlis.get_string_from_tos(buffer_url)
            if isinstance(cached, dict):
                cached_text = cached.get('text')
        if cached_text is not None:
            if len(cached_text) > 0:
                tos_utlis.put_string_to_tos(buffer_url, {'text': cached_text})
            yield sse(cached_text, 1)
            yield sse(follow_up, 10)
            return

        # NOTE(review): the limit enforced here is 50 minutes but the message
        # shown to the user says 20min - confirm which one is intended.
        if data.get("duration") / 60 > 50:
            yield sse("该视频时长超出20min暂不支持提取", 1)
            return
        if not tos_utlis.check_file_in_tos(f"bilibili/{bvid}/{bvid}.mp3"):
            yield sse("视频提取失败请稍后重试", 0)
            return

        media_url = f"https://douchacha-web.tos-cn-beijing.volces.com/bilibili/{bvid}/{bvid}.mp3"
        recognizer = asr_utils.SpeechRecognizer()
        audio_result = recognizer.recognize_audio(media_url)
        if audio_result is None:
            yield sse("语音识别失败", 0)
            return

        if len(audio_result) > 0:
            # Only non-empty transcripts are written back to the cache.
            tos_utlis.put_string_to_tos(buffer_url, {'text': audio_result})
            content = audio_result
        else:
            content = "该视频没有文案"
        yield sse(content, 1)
        yield sse(follow_up, 10)
    except Exception as e:
        yield sse(f"处理失败: {str(e)}", 0)
############################################################
def process_urls(urls: List[str]):
    """Dispatch the last URL in ``urls`` to the matching platform handler.

    The URL is classified by ``parse_url``; only ``type == "video"`` results
    for platforms ``dy``, ``xhs``, ``bili`` and ``ks`` are handled. Anything
    else, or any exception raised while selecting or running the handler,
    produces a single type-1 SSE frame asking for a valid link.
    """
    try:
        target = urls[-1]
        parsed = parse_url(target)
        platform = parsed["platform"]
        content_type = parsed["type"]

        handler = None
        if content_type == "video":
            if platform == "dy":
                handler = process_douyin_video
            elif platform == "xhs":
                handler = process_xhs_video
            elif platform == "bili":
                handler = process_bzhan_video
            elif platform == "ks":
                handler = process_ks_video

        if handler is None:
            notice = create_res_data("请输入要处理的视频或图文链接", 1)
            yield f"data: {json.dumps(notice, ensure_ascii=False)}\n\n".encode('utf-8')
            return
        yield from handler(target, parsed)
    except Exception:
        fallback = create_res_data("请提供视频或者图文链接", 1)
        yield f"data: {json.dumps(fallback, ensure_ascii=False)}\n\n".encode('utf-8')
def extract_mp4_to_text(text: str):
    """Find every http(s) URL in ``text`` and stream the result of ``process_urls`` on them."""
    found_links = re.findall(
        r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]',
        text,
    )
    yield from process_urls(found_links)
def convert_dialog_format(original_dialog: list) -> list:
    """Convert ``{'user': ...}`` / ``{'assistant': ...}`` items into role/content messages.

    For each item the ``user`` key wins over ``assistant`` when both are
    present; items with neither key are dropped.
    """
    messages = []
    for entry in original_dialog:
        role = next((name for name in ('user', 'assistant') if name in entry), None)
        if role is not None:
            messages.append({'role': role, 'content': entry[role]})
    return messages
def get_last_user_value(dialogs: list) -> str or None:
    """Return the last ``user`` entry of ``dialogs`` as a one-message list.

    Despite the annotation, the result is ``[{'role': 'user', 'content': ...}]``
    built from the final item that has a ``user`` key, or ``None`` when there
    is no such item or its value is ``None``.
    """
    user_values = [entry['user'] for entry in dialogs if 'user' in entry]
    if not user_values or user_values[-1] is None:
        return None
    return [{'role': 'user', 'content': user_values[-1]}]
def transform_image_urls(url_list: List[str]) -> List[Dict]:
    """Wrap each URL as ``{"type": "image_url", "image_url": {"url": url}}``."""
    wrapped = []
    for link in url_list:
        wrapped.append({"type": "image_url", "image_url": {"url": link}})
    return wrapped
def create_res_data(message, type_):
    """Build the event payload dict with keys ``message`` and ``type``."""
    return dict(message=message, type=type_)
def get_dy_download_images(video_id, image_length):
    """List ``image_length`` image entries under the ``assets/<video_id>/`` static path.

    Each entry carries its zero-based ``image_index`` and the ``image_url`` of
    ``<index>.jpg``.
    """
    return [
        {
            "image_index": position,
            "image_url": f"https://static2.douchacha.com/assets/{video_id}/{position}.jpg",
        }
        for position in range(image_length)
    ]
def get_xhs_download_images(video_id, image_length):
    """List ``image_length`` image entries under the ``xhs/<video_id>/`` static path.

    Each entry carries its zero-based ``image_index`` and the ``image_url`` of
    ``<index>.jpg``.
    """
    return [
        {
            "image_index": position,
            "image_url": f"https://static2.douchacha.com/xhs/{video_id}/{position}.jpg",
        }
        for position in range(image_length)
    ]
def comment_process(comments):
    """Join the ``text`` of every comment into one ``user`` dialog entry.

    Args:
        comments: Iterable of dict-like comments read with ``.get('text')``.

    Returns:
        A one-element list ``[{"user": "<text1>, <text2>, ..."}]``. Comments
        whose ``text`` is missing or not a string are skipped, so one
        text-less comment no longer makes ``str.join`` raise ``TypeError``.
    """
    texts = [comment.get('text') for comment in comments]
    joined = ", ".join(text for text in texts if isinstance(text, str))
    return [{"user": joined}]
def extract_user_contents(dialogues):
    """Collect ``content`` from every item that has both keys and ``role == 'user'``."""
    return [
        message['content']
        for message in dialogues
        if 'role' in message and 'content' in message and message['role'] == 'user'
    ]
import requests
import urllib.parse
import re
from urllib.parse import quote_plus
"""
快手
"""
def is_valid_kuaishou_profile(url: str) -> bool:
    """Return True when ``url`` is exactly ``https://www.kuaishou.com/f/<id>``.

    ``<id>`` is 10-20 characters from digits, ASCII letters, ``_`` and ``-``.
    The whole string must match: ``re.fullmatch`` is used because ``match``
    with a trailing ``$`` also accepts a URL followed by a newline.
    """
    return re.fullmatch(
        r'https://www\.kuaishou\.com/f/[0-9a-zA-Z_-]{10,20}',
        url,
    ) is not None
def get_ks_info(video_url):
    """Query the crawler's ``/api/v1/ks`` endpoint for a Kuaishou share link.

    Returns ``None`` without any request when ``video_url`` fails
    ``is_valid_kuaishou_profile``; otherwise returns the decoded JSON body.
    """
    if not is_valid_kuaishou_profile(video_url):
        return None
    endpoint = f"http://172.16.18.10:8875/api/v1/ks?share_url={quote_plus(video_url)}"
    reply = requests.request("GET", endpoint, headers={}, data={})
    return reply.json()
"""
小红书
"""
def is_valid_xiaohongshu_link(url: str) -> bool:
    """Return True for a Xiaohongshu note URL with ``xsec_token`` or an ``xhslink.com`` short link.

    Accepted shapes:
      * ``https://www.xiaohongshu.com/(discovery|explore)/[item/]<24 hex>?...xsec_token=...``
      * ``http://xhslink.com/m/<11 alphanumerics>``
    """
    note_page = (
        r'^https://www\.xiaohongshu\.com/(?:discovery|explore)/'
        r'(?:item/)?[0-9a-fA-F]{24}\?'
        r'(?:[^&]+&)*?'
        r'xsec_token=[^&]+'
        r'(?:&[^&]+)*?$'
    )
    short_link = r'^http://xhslink\.com/m/[A-Za-z0-9]{11}$'
    if re.match(note_page, url):
        return True
    return re.match(short_link, url) is not None
def get_xhs_info(video_url):
    """Query the crawler's ``/api/v1/xhs`` endpoint for a Xiaohongshu link.

    Returns ``None`` without any request when ``video_url`` fails
    ``is_valid_xiaohongshu_link``; otherwise prints the link and returns the
    decoded JSON body.
    """
    if not is_valid_xiaohongshu_link(video_url):
        return None
    print(video_url)
    endpoint = f"http://172.16.18.10:8875/api/v1/xhs?share_url={quote_plus(video_url)}"
    reply = requests.request("GET", endpoint, headers={}, data={})
    return reply.json()
"""
抖音图文
"""
def get_dy_ocr_info(video_url):
    """Query the crawler's ``/api/v1/ocr`` endpoint for a Douyin link.

    Two link shapes are recognised anywhere inside ``video_url``:
      * a numeric id after ``douyin.com/(video|share/video|note|jingxuan)/``,
        ``video_id`` or ``modal_id=`` -> requested as ``video_id=<digits>``;
      * a ``v.douyin.com/<code>`` short link -> requested as
        ``video_share_url=<video_url>``. When both match, the short link wins.

    Returns:
        The decoded JSON body, or ``None`` (no request made) when neither
        shape is found.
    """
    endpoint = ""

    # NOTE(review): the "video_id" alternative has no "=", so "video_id=123"
    # does not match while "video_id123" does - confirm the intended form.
    id_match = re.search(
        r'(?:douyin\.com/(?:video|share/video|note|jingxuan)/|video_id|modal_id=)(\d+)',
        video_url,
    )
    if id_match:
        endpoint = f"http://172.16.18.10:8875/api/v1/ocr?video_id={id_match.group(1)}"

    if re.search(r'https?://v\.douyin\.com/[a-zA-Z0-9_\-]+/?', video_url):
        # Percent-encode the link like the other crawler helpers do, so "&",
        # "#", "+" or spaces in it cannot truncate or alter the query string.
        endpoint = f"http://172.16.18.10:8875/api/v1/ocr?video_share_url={quote_plus(video_url)}"

    if endpoint == "":
        return None
    response = requests.request("GET", endpoint, headers={}, data={})
    return response.json()
def is_valid_bilibili_video(url: str) -> bool:
    """Return True when ``url`` is a ``www.bilibili.com`` video link with a query string.

    Required shape: ``https://www.bilibili.com/video/BV<id>[/]?<params>`` where
    ``<id>`` is 10-30 ASCII letters or digits and ``<params>`` is at least one
    character. The whole string must match: ``re.fullmatch`` is used because
    ``match`` with a trailing ``$`` also accepts a URL followed by a newline.
    """
    return re.fullmatch(
        r'https://www\.bilibili\.com/video/BV[0-9a-zA-Z]{10,30}/?\?.+',
        url,
    ) is not None
def get_bilibili_info(video_url):
    """Query the crawler's ``/api/v1/bilibili`` endpoint for a Bilibili link.

    Returns ``None`` without any request when ``video_url`` fails
    ``is_valid_bilibili_video``; otherwise returns the decoded JSON body.
    """
    if not is_valid_bilibili_video(video_url):
        return None
    endpoint = f"http://172.16.18.10:8875/api/v1/bilibili?share_url={quote_plus(video_url)}"
    reply = requests.request("GET", endpoint, headers={}, data={})
    return reply.json()
if __name__ == '__main__':
    # Manual smoke test: fetch OCR info for one Douyin short link and print it.
    # Other sample links used during development:
    #   https://v.kuaishou.com/KFy0JkVY
    #   https://www.xiaohongshu.com/discovery/item/6894c614000000000403dcad?source=webshare&xhsshare=pc_web&xsec_token=CBmYaPr2HJVyffdsKGD_zkN5gOTLXgasc3dPOzNRGjKhs=&xsec_source=pc_share
    sample_link = "https://v.douyin.com/MXCfDjwnbBs/ "
    print(get_dy_ocr_info(sample_link))
\ No newline at end of file
# Intent-classification prompt: tells the model to answer with a single digit
# (1 copy extraction, 2 copy rewriting, 3 DSO Douyin search, 4 comment
# analysis) chosen by the priority rules listed in the text.
action_content = """
# 角色
你是一个文本判断回复助手,用户将输入内容,你需依据规则判断其对应类别并回复。
# 任务描述与要求
1. 仔细分析用户输入内容,判断其与以下类别之一的相关性:
- 文案提取 → 回复1
- 文案改写 → 回复2
- dso抖音搜索 → 回复3
- 评论分析 → 回复4
2. 判断逻辑优先级:
- 若输入中包含“文案提取”“提取文案”等明确指向文案提取的表述,回复1
- 若输入中包含“文案改写”“改写文案”等明确指向文案改写的表述,回复2
- 若输入中包含“dso抖音搜索”“抖音dso搜索”“抖音搜索”等明确指向dso抖音搜索的表述,回复3
- 若输入中包含“评论分析”“分析评论”等明确指向评论分析的表述,回复4
- 若输入中包含链接时优先返回1
- 若不包含上述表述,再判断是否为“纯平台链接”:仅当输入内容为单一链接(无任何其他字符,包括汉字、符号、空格等),且域名包含抖音、快手、小红书、B站时,回复1
# 相关限制
1. 回复必须严格依据上述规则,不得随意给出其他回复
2. 回复内容只能是1、2、3、4中的一个,保持简洁
"""
# Intent-classification prompt for a whole conversation history: same four
# categories and single-digit reply (1-4) as described in the text itself.
last = """
# 角色
你是一个文本判断回复助手,用户将输入与ai的对话历史
# 任务描述与要求
# 仔细分析用户输入内容,判断其与文案提取、文案改写、dso抖音搜索、评论分析的相关性。
# 若与文案提取相关,回复:1
# 若与文案改写相关,回复:2
# 若与dso抖音搜索相关,回复:3
# 若与评论分析相关,回复:4
# 相关限制
用户只输入了一个链接并且域名包含抖音、快手、小红书、b站并且没有任何汉字时回复:1
1. 回复必须准确依据规则,不能随意给出其他回复。
2. 回复内容必须简洁,只能是1、2、3、4
"""
# Rewriting prompt: asks the model to rewrite a Douyin short-video script in
# plain spoken language while keeping names, times, places, numbers and policy
# names unchanged.
douyin_text_fix = """
# 角色
你是一位专业的抖音短视频脚本改写专家,能够在遵循原文逻辑的基础上,
对给定的短视频脚本进行改写。要以自然、简洁的口语化方式表达,模仿真人说话的语气,
坚决不使用排比句以及复杂的词汇。同时,务必保留人物名、时间、地点、数字、政策名称等核心细节,保证原文关键信息不改变。
## 技能
### 技能 1: 抖音短视频脚本改写
1. 接收用户提供的抖音短视频脚本后,认真研读脚本内容,精准把握原文逻辑。
2. 逐句对脚本进行改写,使用简单易懂、贴近生活的口语化表述,摒弃排比句和复杂生僻的词汇。
3. 完成改写后,仔细核对,确保所有核心细节及关键信息与原文一致。
## 限制
- 仅围绕抖音短视频脚本改写提供服务,不回答与脚本改写无关的问题。
- 改写后的脚本必须严格遵循原文逻辑,核心细节和关键信息不得有误。
- 表述要口语化、自然、简洁,杜绝使用排比句和复杂词汇。
"""
# General assistant persona prompt; its text is the same as
# helpful_content_text minus that prompt's final URL-handling restriction.
helpful_content_text_rag = """
# 角色
你是爱搜灵犀北京爱查查开发的智能助手,你将根据用户的各种咨询需求,依据既定知识和逻辑,一步步为用户提供准确且有用的回答。
# 任务描述与要求
1. 对于用户提出的问题,需深入理解其意图,从多维度分析并给出全面的回答。
2. 回答要简洁明了,避免使用过于复杂的语言和句式,确保用户能轻松理解。
3. 若涉及专业知识,要在解释清楚的基础上,结合通俗易懂的例子辅助说明。
# 相关限制
1. 回答需基于准确的知识,不能随意编造信息。
2. 避免给出模棱两可、没有明确结论的回答。
3. 语言风格要保持友好、专业,不能使用不当或冒犯性的词汇。
不知道答案时返回 我还不太清楚你要表达什么意思
"""
# General assistant persona prompt; the last line additionally tells the model
# to report a malformed link and show the correct URL format when the question
# is a URL related to copy extraction.
helpful_content_text = """
# 角色
你是爱搜灵犀北京爱查查开发的智能助手,你将根据用户的各种咨询需求,依据既定知识和逻辑,一步步为用户提供准确且有用的回答。
# 任务描述与要求
1. 对于用户提出的问题,需深入理解其意图,从多维度分析并给出全面的回答。
2. 回答要简洁明了,避免使用过于复杂的语言和句式,确保用户能轻松理解。
3. 若涉及专业知识,要在解释清楚的基础上,结合通俗易懂的例子辅助说明。
# 相关限制
1. 回答需基于准确的知识,不能随意编造信息。
2. 避免给出模棱两可、没有明确结论的回答。
3. 语言风格要保持友好、专业,不能使用不当或冒犯性的词汇。
不知道答案时返回 我还不太清楚你要表达什么意思
## 限制 当用户问题是一个url 并且与文案提取相关时 回答链接格式错误 并且返回正确url链接样式
"""
# Comment-analysis prompt: asks for a plain-text report covering core topics,
# keywords, sentiment distribution, positive/negative viewpoints and a summary.
comment_content= """
请根据提供的视频评论内容,生成一份结构清晰、内容详实的分析报告。报告需包含以下核心板块:
核心讨论信息与热点话题:分别提炼出核心讨论信息和热点话题。核心讨论信息要围绕评论中反复提及的关键表述、相关概念对比及用户重点关注的语言特征等方面;热点话题需聚焦用户对评论内容迭代、不同场景下评论表述差异等的反馈。
核心关键词:汇总评论中出现的与评论内容本身、用户表达感受、相关概念对比等紧密相关的关键词汇。
讨论内容情感分布:按正面、负面、中性情感分类,说明各情感占比,并列举相应的典型评论内容,体现不同情感的核心表达。
与评论相关的观点分析:从正面观点和负面观点两方面阐述,正面观点涵盖评论内容的优势、表述清晰度、情感传递效果等;负面观点指出评论内容存在的问题,同时挖掘潜在的优化机会点。
总结:概括评论内容的整体舆情概况,提炼重点信息,包括评论内容的差异化特征、引发关注的核心因素以及需要改进或明确的表述方面等。
整体报告需逻辑连贯,语言简洁明了,能清晰呈现视频评论本身的关键信息和用户对评论的态度。
返回文字就可以 不需要 样式
"""
# -*- coding: utf-8 -*-
import time
import tos
from io import StringIO
import json
# Connection settings passed to every tos.TosClientV2 created in this module.
# NOTE(review): the access key and secret key are committed in source; they
# should be rotated and loaded from the environment or a secret store.
TOS_AK = "AKLTMmRiMmU3YmY5ZjZjNDZkMTlhMmQxY2JkYTllYTQzNDI"
TOS_SK = "WkdWak4yUTRNakl3WVdOa05HUXdaR0V4TlRBM1l6YzJZMll5WkRnMFlUTQ=="
TOS_ENDPOINT = "tos-cn-beijing.volces.com"
TOS_REGION = "cn-beijing"
# Bucket used by all object checks, reads and writes in this module.
TOS_BUCKET_NAME = "douchacha-web"
def check_file_in_tos(object_key, retry_times=1000):
    """Poll the bucket until ``object_key`` exists or the attempts run out.

    Args:
        object_key: Key to look up with ``head_object``.
        retry_times: Number of retries after the first attempt, one second
            apart (``retry_times + 1`` attempts in total).

    Returns:
        True as soon as a HEAD request succeeds, False once every attempt has
        failed.
    """
    client = tos.TosClientV2(TOS_AK, TOS_SK, TOS_ENDPOINT, TOS_REGION, max_retry_count=3)
    for attempt in range(retry_times + 1):
        try:
            client.head_object(bucket=TOS_BUCKET_NAME, key=object_key)
            return True
        except Exception:
            # Every failure counts as an attempt. Previously only a 404
            # advanced the counter, so any other error (network, auth, ...)
            # spun in this loop forever without sleeping.
            if attempt < retry_times:
                time.sleep(1)
    return False
def check_file_in_tos_buff(object_key):
    """Single, non-retrying existence check: True if ``head_object`` succeeds, False on any exception."""
    client = tos.TosClientV2(TOS_AK, TOS_SK, TOS_ENDPOINT, TOS_REGION, max_retry_count=3)
    try:
        client.head_object(bucket=TOS_BUCKET_NAME, key=object_key)
    except Exception:
        return False
    return True
def put_string_to_tos( object_key, content, retry_times=3):
    """Store ``content`` as JSON under ``object_key``.

    Serialisation and upload are attempted up to ``retry_times`` times; each
    failure is printed. Returns True on the first success, False otherwise.
    """
    client = tos.TosClientV2(TOS_AK, TOS_SK, TOS_ENDPOINT, TOS_REGION, max_retry_count=3)
    for _ in range(retry_times):
        try:
            client.put_object(TOS_BUCKET_NAME, object_key, content=json.dumps(content))
        except Exception as upload_error:
            print(upload_error)
        else:
            return True
    return False
# Fetch ``object_key`` from the TOS bucket and decode its body as JSON.
#
# The read is attempted inside a loop of ``retry_times`` iterations. A
# successful read returns the value produced by ``json.loads``. A
# TosServerError whose status_code is 404 prints a "file does not exist"
# message and returns None. TosClientError and json.JSONDecodeError both
# print "JSON解析失败", so that message does not necessarily mean a parse
# problem.
# NOTE(review): indentation was lost in this view, so it is unclear whether
# the final ``return False`` belongs to the last handler or follows the loop.
# Either way the result can be None or False instead of a dict, and callers
# must check it before calling ``.get`` on it.
def get_string_from_tos( object_key, retry_times=3):
client = tos.TosClientV2(TOS_AK, TOS_SK, TOS_ENDPOINT, TOS_REGION, max_retry_count=3)
for _ in range(retry_times):
try:
response = client.get_object(TOS_BUCKET_NAME,object_key)
content = response.read()
return json.loads(content)
except tos.exceptions.TosServerError as e:
# Only a 404 ends the call here; other server errors fall through.
if e.status_code == 404:
print(f"文件不存在: {object_key}")
return None
except (tos.exceptions.TosClientError, json.JSONDecodeError) as e:
print(f"JSON解析失败")
return False
#
# if __name__ == '__main__':
#
import json
from flask import request,Flask, jsonify
import redis
# Shared Redis connection (db 8) used by the route handlers to enqueue jobs.
# NOTE(review): host and password are hard-coded in source; move them to
# configuration and rotate the password.
redis_client = redis.Redis(host='172.16.0.24', port=6379,db=8, password='aiyingli@@123', socket_timeout=10,decode_responses=True)
# Flask application that the routes below are registered on.
app = Flask(__name__)
@app.route('/dso/video_gen', methods=['POST'])
def video_gen():
    """Accept a JSON job and push it onto the head of the ``dso_video_gen`` Redis list.

    The body is re-serialised with ``json.dumps`` without validation; the
    response is always the fixed code-200 acknowledgement.
    """
    job = request.get_json()
    serialized_job = json.dumps(job)
    redis_client.lpush('dso_video_gen', serialized_job)
    acknowledgement = {
        "code": 200,
        "data": '',
        "msg": "数据接收成功"
    }
    return jsonify(acknowledgement)
@app.route('/dso/image_gen', methods=['POST'])
def image_gen():
    """Accept a JSON job and push it onto the head of the ``dso_image_gen`` Redis list.

    The body is re-serialised with ``json.dumps`` without validation; the
    response is always the fixed code-200 acknowledgement.
    """
    job = request.get_json()
    serialized_job = json.dumps(job)
    redis_client.lpush('dso_image_gen', serialized_job)
    acknowledgement = {
        "code": 200,
        "data": '',
        "msg": "数据接收成功"
    }
    return jsonify(acknowledgement)
if __name__ == '__main__':
    import os

    # The server listens on every interface, so the interactive debugger must
    # not be on by default: it allows arbitrary code execution from the
    # browser. Set FLASK_DEBUG=1 explicitly for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true")
    app.run(host='0.0.0.0', port=8086, debug=debug_enabled)
\ No newline at end of file
# -*- coding: utf-8 -*-
import requests
import json
import os, sys
import redis
from concurrent.futures import ThreadPoolExecutor, as_completed
base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(base_path)
import tos_util
# Redis connection (db 8) the worker pops its task from.
# NOTE(review): host, password and the API token below are hard-coded in
# source; move them to configuration and rotate them.
redis_client = redis.Redis(host='172.16.0.24', port=6379, db=8, password='aiyingli@@123', socket_timeout=10,
decode_responses=True)
# Authorization header value sent to the image generation API.
doubao_api_token = 'Bearer fcc424e5-58af-494d-9683-5787413a26c9'
# Local directory generated images are downloaded into before upload.
# NOTE(review): developer-machine path, and the name shadows the builtin ``dir``.
dir = '/Users/yaowentong/Desktop/image/'
tos_dir = ''
# Aspect ratio label -> "WIDTHxHEIGHT" size string sent to the API.
size_map = {
'1:1': '2048x2048',
'4:3': '2304x1728',
'3:4': '1728x2304',
'16:9': '2560x1440',
'9:16': '1440x2560',
'3:2': '2496x1664',
'2:3': '1664x2496',
'21:9': '3024x1296',
}
def gen_image(prompt, size,task_id):
    """Request one image from the generation API and return the response's ``data`` field.

    ``size`` is a key of ``size_map`` (an unknown key raises ``KeyError``
    before any request is made). If the request or JSON decoding raises, an
    error marker is written to ``dso/ai_gen/single/<task_id>/info.json``, the
    exception is printed and ``None`` is returned implicitly.

    NOTE(review): ``timeout=10000`` is in seconds (about 2.8 hours) - confirm
    that milliseconds were not intended.
    """
    request_body = {
        "model": "doubao-seedream-4-0-250828",
        "prompt": prompt,
        "size": size_map[size],
        "sequential_image_generation": "disabled",
        "sequential_image_generation_options": {
            "max_images": 1
        },
        "stream": False,
        "response_format": "url",
        "watermark": False
    }
    payload = json.dumps(request_body)
    headers = {
        'Authorization': doubao_api_token,
        'Content-Type': 'application/json'
    }
    url = "https://ark.cn-beijing.volces.com/api/v3/images/generations"
    try:
        response = requests.request("POST", url, headers=headers, data=payload, timeout=10000)
        return response.json().get('data')
    except Exception as e:
        json_url = f'dso/ai_gen/single/{task_id}/info.json'
        tos_util.put_string_to_tos(json_url,
                                   ["异常:你输入的文字不符合制作规则,请修改后重试"])
        print(f"下载失败:{e}")
def download_image(url, save_path):
    """Download ``url`` to ``save_path``, creating parent directories as needed.

    Failures are printed rather than raised, so the caller has to check that
    the file exists afterwards.

    Args:
        url: Image URL to fetch.
        save_path: Destination file path.
    """
    try:
        # The response is used as a context manager so the streamed connection
        # is released even when writing fails part-way.
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            save_dir = os.path.dirname(save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:  # skip empty chunks
                        file.write(chunk)
        print(f"图片已成功下载至:{save_path}")
    except Exception as e:
        print(f"下载失败:{str(e)}")
def process_gen_image(prompt, ratio, task_id):
    """Generate four images concurrently, upload them and publish their URLs.

    Four ``gen_image`` calls run in a thread pool. Each usable result is
    downloaded to ``<dir><task_id>/<index>.jpeg``, uploaded to
    ``dso/ai_gen/single/<task_id>/<index>.jpeg`` and its public URL collected.
    The URL list is written to ``dso/ai_gen/single/<task_id>/info.json``.

    A generation that returned nothing, or an image that could not be
    downloaded, is skipped instead of aborting the task with ``TypeError`` or
    an upload of a missing file. If nothing could be published, the same error
    marker ``gen_image`` uses is written to ``info.json``.

    Args:
        prompt: Text prompt forwarded to ``gen_image``.
        ratio: Aspect ratio label forwarded to ``gen_image``.
        task_id: Identifier used in local and remote paths.
    """
    generated = []
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(gen_image, prompt, ratio, task_id) for _ in range(4)]
        # Results are collected in completion order.
        for future in as_completed(futures):
            result = future.result()
            if result:
                generated.append(result[0])

    json_url = f'dso/ai_gen/single/{task_id}/info.json'
    published_urls = []
    for index, item in enumerate(generated):
        save_path = f'{dir}{task_id}/{index}.jpeg'
        download_image(item['url'], save_path)
        if not os.path.exists(save_path):
            # download_image only prints its errors; do not upload a missing file.
            continue
        tos_util.put_video(f'dso/ai_gen/single/{task_id}/{index}.jpeg', save_path)
        published_urls.append(f'https://static2.douchacha.com/dso/ai_gen/single/{task_id}/{index}.jpeg')

    if not published_urls:
        tos_util.put_string_to_tos(json_url, ["异常:你输入的文字不符合制作规则,请修改后重试"])
        return
    tos_util.put_string_to_tos(json_url, published_urls)
if __name__ == '__main__':
    # Worker entry point: pop one task from the tail of 'dso_image_gen_task'
    # and process it. Any failure (including an empty queue) is only printed.
    task = redis_client.rpop('dso_image_gen_task')
    try:
        task_json = json.loads(task)
        process_gen_image(task_json['prompt'], task_json['ratio'], task_json['task_id'])
    except Exception as err:
        print(f'{err}')
import json
import redis
import subprocess
import time
import os
import psutil # 用于通过PID检查进程是否存活
def _launch_image_worker():
    """Start ``image_gen_process.py`` detached via ``nohup`` and return its PID.

    The shell writes the PID of the background job to a temporary file, which
    is read back and removed. Raises ``Exception`` when the shell reports an
    error or the PID file is missing.
    """
    pid_file = f"/tmp/video_task_{int(time.time())}.pid"
    cmd = (
        f'nohup python3.9 ./image_gen_process.py'
        f'> /dev/null 2>&1 & echo $! > {pid_file}'
    )
    process = subprocess.Popen(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )
    # Wait for the launcher shell so the PID file is complete before reading.
    stdout, stderr = process.communicate()
    if stderr:
        raise Exception(f"命令执行错误: {stderr}")
    if not os.path.exists(pid_file):
        raise Exception("未生成PID文件,无法获取任务进程ID")
    try:
        with open(pid_file, 'r') as f:
            return int(f.read().strip())
    finally:
        os.remove(pid_file)


def main():
    """Dispatch queued image jobs to worker processes, at most ten at a time.

    Loop: drop finished PIDs, then, if below the concurrency limit, pop a task
    from the tail of ``dso_image_gen``, push it onto the head of
    ``dso_image_gen_task`` and start a worker. If the worker cannot be
    started, the task is taken back off the task list and returned to the
    queue. Ctrl+C stops dispatching and waits for running workers.
    """
    # Redis connection settings (adjust for the deployment).
    redis_host = '172.16.0.24'
    redis_port = 6379
    redis_db = 8
    redis_password = 'aiyingli@@123'
    redis_list_key = 'dso_image_gen'  # incoming jobs
    redis_list_key_task = 'dso_image_gen_task'  # jobs handed to workers
    max_concurrent = 10

    try:
        r = redis.Redis(
            host=redis_host,
            port=redis_port,
            db=redis_db,
            password=redis_password,
            decode_responses=True  # return str instead of bytes
        )
        r.ping()
        print("成功连接到Redis")
    except Exception as e:
        print(f"Redis连接失败: {e}")
        return

    # PIDs of the worker processes themselves, not of the launcher shells.
    task_pids = []
    try:
        print("开始监控任务队列... (按Ctrl+C停止)")
        while True:
            # Keep only the PIDs that still exist.
            active_pids = []
            for pid in task_pids:
                if psutil.pid_exists(pid):
                    print(f"任务进程PID {pid} 正在执行")
                    active_pids.append(pid)
                else:
                    print(f"任务进程PID {pid} 已结束")
            task_pids = active_pids

            if len(task_pids) >= max_concurrent:
                print(f"已达到最大并发数 {max_concurrent},等待10秒...")
                time.sleep(10)
                continue

            # Take one task from the tail of the incoming queue.
            task = r.rpop(redis_list_key)
            if not task:
                time.sleep(5)
                continue

            # Keep the payload in ``task``: lpush returns the list length, and
            # overwriting ``task`` with it made the failure path requeue a
            # number instead of the job.
            r.lpush(redis_list_key_task, task)
            try:
                pid = _launch_image_worker()
                task_pids.append(pid)
                print(f"启动任务进程(PID: {pid}),当前并发数: {len(task_pids)}/{max_concurrent}")
            except Exception as e:
                print(f"启动任务失败: {e}")
                # Undo the hand-over and return the original payload to the queue.
                r.lrem(redis_list_key_task, 1, task)
                r.rpush(redis_list_key, task)
                time.sleep(5)
    except KeyboardInterrupt:
        print("\n收到停止信号,正在等待所有任务结束...")
        # Let running workers finish on their own.
        for pid in task_pids:
            if psutil.pid_exists(pid):
                psutil.Process(pid).wait()
            print(f"任务进程PID {pid} 已终止")
        print("所有任务已处理完毕,程序退出")
# Run the dispatcher loop only when executed as a script.
if __name__ == "__main__":
    main()
\ No newline at end of file
import os
import tos
import json
# Connection settings passed to every tos.TosClientV2 created in this module.
# NOTE(review): the access key and secret key are committed in source; they
# should be rotated and loaded from the environment or a secret store.
TOS_AK = "AKLTMmRiMmU3YmY5ZjZjNDZkMTlhMmQxY2JkYTllYTQzNDI"
TOS_SK = "WkdWak4yUTRNakl3WVdOa05HUXdaR0V4TlRBM1l6YzJZMll5WkRnMFlUTQ=="
TOS_ENDPOINT = "tos-cn-beijing.volces.com"
TOS_REGION = "cn-beijing"
# Bucket that receives the uploaded files and JSON documents.
TOS_BUCKET_NAME = "douchacha-web"
def put_video(object_key,file_name):
    """Upload the local file ``file_name`` to the bucket as ``object_key``.

    Only ``TosClientError`` is handled (printed); any other exception
    propagates to the caller.
    """
    try:
        uploader = tos.TosClientV2(TOS_AK, TOS_SK, TOS_ENDPOINT, TOS_REGION)
        uploader.put_object_from_file(TOS_BUCKET_NAME, object_key, file_name)
    except tos.exceptions.TosClientError as client_error:
        report = 'fail with client error, message:{}, cause: {}'.format(
            client_error.message, client_error.cause)
        print(report)
def put_string_to_tos( object_key, content, retry_times=3):
    """Store ``content`` as UTF-8 JSON (non-ASCII kept as-is) under ``object_key``.

    Serialisation and upload are attempted up to ``retry_times`` times; each
    failure is printed. Returns True on the first success, False otherwise.
    """
    client = tos.TosClientV2(TOS_AK, TOS_SK, TOS_ENDPOINT, TOS_REGION, max_retry_count=3)
    for _ in range(retry_times):
        try:
            body = json.dumps(content, ensure_ascii=False).encode('utf-8')
            client.put_object(TOS_BUCKET_NAME, object_key, content=body)
        except Exception as upload_error:
            print(upload_error)
        else:
            return True
    return False
import os
import random
import moviepy.editor as mp
import math
# Font file passed to every TextClip created by the video generators.
# NOTE(review): absolute path on a developer machine; make it configurable.
font_path = '/Users/yaowentong/Desktop/video/mjt.ttf'
# bottom_font_path = '/Users/yaowentong/Desktop/ALIBABA-PUHUITI-MEDIUM.TTF'
# NOTE(review): this is set after ``import moviepy.editor``; MoviePy 1.x
# appears to read FFMPEG_BINARY while it is being imported, so confirm this
# assignment actually takes effect. The path is also machine-specific.
os.environ["FFMPEG_BINARY"] = "/Users/yaowentong/Downloads/ffmpeg"
def gen_9_16_video(
        input_video,
        output_video,
        text_content,
        text_content_color,
        overlay_color,
        title_overlay_opacity,
        font_size,
        vertical_font_size,
        bottom_text_content,
        bottom_text_content_color,
        bottom_overlay_opacity,
        bottom_font_size,
        bottom_stroke_color,
        bottom_stroke_width,
        background_music=None,
        bgm_volume=0.5
):
    """Write a copy of ``input_video`` with a title banner, a text panel and a disclaimer.

    Layers, bottom to top: the source video; a full-width colour band with the
    title; a white panel with the body text; a vertical "视频内容仅供参考"
    notice near the panel's lower-left corner.

    Args:
        input_video: Path of the source video.
        output_video: Path of the file to write (H.264 video, AAC audio).
        text_content: Title text.
        text_content_color: Title colour.
        overlay_color: Colour of the title band.
        title_overlay_opacity: Opacity of the title band.
        font_size: Title font size.
        vertical_font_size: Font size of the vertical notice.
        bottom_text_content: Body text.
        bottom_text_content_color: Body text colour.
        bottom_overlay_opacity: Opacity of the white panel.
        bottom_font_size: Body font size.
        bottom_stroke_color: Body text stroke colour.
        bottom_stroke_width: Body text stroke width.
        background_music: Optional audio file, used only when the source video
            has no audio track.
        bgm_volume: Volume factor applied to ``background_music``.
    """
    video = mp.VideoFileClip(input_video)
    final_video = None
    try:
        w, h = video.size
        video_duration = video.duration

        # ---- Audio ----
        # A source with its own audio keeps it. A silent source gets the
        # optional background music, scaled by bgm_volume (which used to be
        # accepted but never applied).
        if video.audio is None and background_music:
            try:
                bgm = mp.AudioFileClip(background_music).set_duration(video_duration)
                video = video.set_audio(bgm.volumex(bgm_volume))
            except Exception as e:
                # e.g. missing file or unsupported format: continue without music.
                print(f"加载背景音乐失败:{str(e)}")

        # ---- 1. Title band: full width, 16% of the height, 11% from the top ----
        yellow_overlay_h = math.ceil(h * 0.16)
        yellow_overlay_y = math.ceil(h * 0.11)
        yellow_overlay = mp.ColorClip(
            size=(w, yellow_overlay_h),
            color=overlay_color
        )
        yellow_overlay = yellow_overlay.set_opacity(title_overlay_opacity).set_duration(video.duration)
        yellow_overlay = yellow_overlay.set_position((0, yellow_overlay_y))

        # ---- 2. Title text, inset from the band by 4% of the width on every side ----
        padding = math.ceil(w * 0.04)
        text_area_width = w - 2 * padding
        text_area_height = yellow_overlay_h - 2 * padding
        text_clip = mp.TextClip(
            text_content,
            color=text_content_color,
            font=font_path,
            fontsize=font_size,
            align='Center',
            size=(text_area_width, text_area_height),
            stroke_color='black',
            stroke_width=2
        )
        text_x = padding
        text_y = yellow_overlay_y + padding
        text_clip = text_clip.set_position((text_x, text_y)).set_duration(video.duration)

        # ---- 3. White panel: 6% side margins, 64% of the height, 30% from the top ----
        margin_x = math.ceil(w * 0.06)
        bottom_overlay_w = w - 2 * margin_x
        bottom_overlay_x = margin_x
        bottom_overlay_h = math.ceil(h * 0.64)
        bottom_overlay_y = math.ceil(h * 0.30)
        white_overlay = mp.ColorClip(
            size=(bottom_overlay_w, bottom_overlay_h),
            color=(255, 255, 255)
        )
        white_overlay = white_overlay.set_opacity(bottom_overlay_opacity).set_duration(video.duration)
        white_overlay = white_overlay.set_position((bottom_overlay_x, bottom_overlay_y))

        # ---- 4. Vertical disclaimer ----
        # The text is made vertical by putting one character per line (no
        # rotation is involved); it is drawn in white with a thin black stroke.
        vertical_text = "视频内容仅供参考"
        vertical_text_processed = '\n'.join(vertical_text)
        vertical_text_clip = mp.TextClip(
            vertical_text_processed,
            color='white',
            font=font_path,
            align='center',
            fontsize=vertical_font_size,
            stroke_color='black',
            stroke_width=1
        )
        vertical_padding = math.ceil(w * 0.02)
        vertical_x = bottom_overlay_x + vertical_padding
        vertical_y = math.ceil(h * 0.70)
        vertical_text_clip = vertical_text_clip.set_position(
            (vertical_x, vertical_y)
        ).set_duration(video.duration)

        # ---- 5. Body text, inset from the panel by 6% of the width on every side ----
        bottom_padding = math.ceil(w * 0.06)
        bottom_text_area_width = bottom_overlay_w - 2 * bottom_padding
        bottom_text_area_height = bottom_overlay_h - 2 * bottom_padding
        bottom_text_clip = mp.TextClip(
            bottom_text_content,
            color=bottom_text_content_color,
            font=font_path,
            fontsize=bottom_font_size,
            align='Center',
            size=(bottom_text_area_width, bottom_text_area_height),
            stroke_color=bottom_stroke_color,
            stroke_width=bottom_stroke_width
        )
        bottom_text_x = bottom_overlay_x + bottom_padding
        bottom_text_y = bottom_overlay_y + bottom_padding
        bottom_text_clip = bottom_text_clip.set_position((bottom_text_x, bottom_text_y)).set_duration(video.duration)

        # ---- 6. Compose and write ----
        final_video = mp.CompositeVideoClip([
            video, yellow_overlay, text_clip, white_overlay, vertical_text_clip, bottom_text_clip
        ])
        final_video.write_videofile(
            output_video,
            codec='libx264',
            audio_codec='aac',
            fps=video.fps
        )
    finally:
        # Release readers even when rendering fails.
        if final_video is not None:
            final_video.close()
        video.close()
def gen_16_9_video(
        input_video,
        output_video,
        text_content,
        text_content_color,
        overlay_color,
        title_overlay_opacity,
        font_size,
        vertical_font_size,
        bottom_text_content,
        bottom_text_content_color,
        bottom_overlay_opacity,
        bottom_font_size,
        bottom_stroke_color,
        bottom_stroke_width,
        background_music=None,  # optional path to a background-music file
        bgm_volume=0.5  # intended music volume (0-1); see NOTE in the docstring
):
    """Render a landscape video with a title banner, a body panel and a disclaimer.

    The input video is composited with, in z-order: a coloured title banner,
    the title text, a white body panel, a vertical disclaimer text and the
    body text. The result is written to ``output_video`` (H.264 / AAC).

    Args:
        input_video: Path of the source video.
        output_video: Path of the rendered file.
        text_content / text_content_color / font_size: Title text and style.
        overlay_color / title_overlay_opacity: Title banner colour and opacity.
        vertical_font_size: Font size of the vertical disclaimer.
        bottom_text_content / bottom_text_content_color / bottom_font_size:
            Body text and style.
        bottom_overlay_opacity: Opacity of the white body panel.
        bottom_stroke_color / bottom_stroke_width: Outline of the body text.
        background_music: Music file used only when the source has no audio.
        bgm_volume: Accepted for interface compatibility.
            NOTE(review): this value is currently not applied to the music
            track; wiring it in would change the loudness of existing output.

    Relies on the module-level ``font_path`` for every text clip.
    """
    video = mp.VideoFileClip(input_video)
    final_video = None
    try:
        w, h = video.size  # frame width / height in pixels
        video_duration = video.duration

        # -------------------------- audio --------------------------
        # Keep the original track when there is one; otherwise fall back to the
        # optional background music, with its duration set to the video length.
        if video.audio is None and background_music:
            try:
                bgm = mp.AudioFileClip(background_music).set_duration(video_duration)
                video = video.set_audio(bgm)
            except Exception as e:
                # Best effort: a missing/unsupported music file yields a silent video.
                print(f"加载背景音乐失败:{str(e)}")

        # -------------------------- 1. title banner --------------------------
        yellow_overlay_h = math.ceil(h * 0.16)
        yellow_overlay_y = math.ceil(h * 0.11)
        yellow_overlay = mp.ColorClip(
            size=(w, yellow_overlay_h),
            color=overlay_color
        )
        yellow_overlay = yellow_overlay.set_opacity(title_overlay_opacity).set_duration(video.duration)
        yellow_overlay = yellow_overlay.set_position((0, yellow_overlay_y))

        # -------------------------- 2. title text --------------------------
        padding = math.ceil(h * 0.01)  # margin is 1% of the frame height
        text_area_width = w - 2 * padding  # margin on the left and on the right
        text_area_height = yellow_overlay_h - padding  # a single margin is removed vertically
        title_stroke_color = 'black'
        title_stroke_width = 2
        text_clip = mp.TextClip(
            text_content,
            color=text_content_color,
            font=font_path,
            fontsize=font_size,
            align='Center',
            size=(text_area_width, text_area_height),
            stroke_color=title_stroke_color,
            stroke_width=title_stroke_width
        )
        # Position = banner origin shifted by the margin.
        text_x = padding
        text_y = yellow_overlay_y + padding
        text_clip = text_clip.set_position((text_x, text_y)).set_duration(video.duration)

        # -------------------------- 3. white body panel --------------------------
        margin_x = math.ceil(w * 0.06)  # side margin is 6% of the frame width
        bottom_overlay_w = w - 2 * margin_x
        bottom_overlay_x = margin_x
        bottom_overlay_h = math.ceil(h * 0.64)
        bottom_overlay_y = math.ceil(h * 0.30)
        white_overlay = mp.ColorClip(
            size=(bottom_overlay_w, bottom_overlay_h),
            color=(255, 255, 255)
        )
        white_overlay = white_overlay.set_opacity(bottom_overlay_opacity).set_duration(video.duration)
        white_overlay = white_overlay.set_position((bottom_overlay_x, bottom_overlay_y))

        # -------------------------- 4. vertical disclaimer --------------------------
        vertical_text = "视频内容仅供参考"
        vertical_stroke_color = 'black'
        vertical_stroke_width = 1
        # One character per line produces the vertical layout (no rotation involved).
        vertical_text_processed = '\n'.join(vertical_text)
        vertical_text_clip = mp.TextClip(
            vertical_text_processed,
            color='white',
            font=font_path,
            align='center',
            fontsize=vertical_font_size,
            stroke_color=vertical_stroke_color,
            stroke_width=vertical_stroke_width
        )
        vertical_padding = math.ceil(w * 0.02)
        vertical_x = bottom_overlay_x + vertical_padding  # just inside the panel's left edge
        vertical_y = math.ceil(h * 0.60)  # fixed at 60% of the frame height
        vertical_text_clip = vertical_text_clip.set_position(
            (vertical_x, vertical_y)
        ).set_duration(video.duration)

        # -------------------------- 5. body text --------------------------
        bottom_padding = math.ceil(w * 0.06)
        bottom_text_area_width = bottom_overlay_w - 2 * bottom_padding
        bottom_text_area_height = bottom_overlay_h - 2 * bottom_padding
        bottom_text_clip = mp.TextClip(
            bottom_text_content,
            color=bottom_text_content_color,
            font=font_path,
            fontsize=bottom_font_size,
            align='Center',
            size=(bottom_text_area_width, bottom_text_area_height),
            stroke_color=bottom_stroke_color,
            stroke_width=bottom_stroke_width
        )
        bottom_text_x = bottom_overlay_x + bottom_padding
        bottom_text_y = bottom_overlay_y + bottom_padding
        bottom_text_clip = bottom_text_clip.set_position((bottom_text_x, bottom_text_y)).set_duration(video.duration)

        # -------------------------- 6. composite and export --------------------------
        final_video = mp.CompositeVideoClip([
            video, yellow_overlay, text_clip, white_overlay, vertical_text_clip, bottom_text_clip
        ])
        final_video.write_videofile(
            output_video,
            codec='libx264',
            audio_codec='aac',
            fps=video.fps
        )
    finally:
        # Release the underlying readers even when building or rendering fails.
        video.close()
        if final_video is not None:
            final_video.close()
# -------------------------- Example invocation --------------------------
if __name__ == "__main__":
    print('aaa')
    # Disabled manual smoke test, kept for reference only.
    # NOTE(review): the call below predates the current gen_16_9_video signature
    # (it omits the opacity and stroke arguments) and would fail if re-enabled.
    #
    # Palette entries: (overlay RGB, title colour, body colour)
    # palettes = [((53, 53, 53), '#ff890c', '#ff890c'),
    #             ((27, 95, 171), '#fbf2e1', '#fbf2e1'),
    #             ((253, 255, 239), '#53579f', '#53579f'),
    #             ((30, 67, 50), '#fffbf0', '#fffbf0'),
    #             ((198, 77, 52), '#e6f1d9', '#e6f1d9'),
    #             ]
    # color_list = random.choice(palettes)
    # print(color_list)
    # Font-size notes from the author: "1080 90 40 85" / "720 sora 60 25 55"
    # gen_16_9_video(
    #     input_video="/Users/yaowentong/Desktop/video/1111.mp4",
    #     output_video="/Users/yaowentong/Desktop/video/1111_result.mp4",
    #     text_content='月子餐怎么做',
    #     text_content_color=color_list[1],
    #     overlay_color=color_list[0],
    #     font_size=50,
    #     vertical_font_size=20,
    #     bottom_font_size=30,
    #     bottom_text_content="<six lines of sample ingredient text>",
    #     bottom_text_content_color=color_list[1],
    #     background_music='/Users/yaowentong/Desktop/Backbeat.mp3'
    # )
    # gen_16_9_video(input_video="/Users/yaowentong/Desktop/720_16_9.mp4")
# -*- coding: utf-8 -*-
import requests
import json
import time
from urllib.parse import urlparse
import os, sys
import cv2
base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(base_path)
import video_gen
import tos_util
import redis
import random
# NOTE(review): the API tokens and the Redis password below are committed in
# plain text. They should be rotated and loaded from the environment or a
# secret store instead of living in source control.
doubao_api_token = 'Bearer fcc424e5-58af-494d-9683-5787413a26c9'
sora_api_token = 'Bearer ade2547a49722db6375545e4123e2088'
# Local working directory for downloaded and rendered videos.
# NOTE(review): this name shadows the builtin dir(); other functions in this
# module read it, so a rename has to be done file-wide.
dir = '/Users/yaowentong/Desktop/video/'
redis_client = redis.Redis(host='172.16.0.24', port=6379, db=8, password='aiyingli@@123', socket_timeout=10,
                           decode_responses=True)
# Colour themes, one of which is picked at random per video. Tuple layout:
# (title overlay RGB, title text colour, body text colour,
#  title overlay opacity, body overlay opacity, body stroke colour, body stroke width)
# NOTE(review): this name shadows the builtin list; see the note on `dir`.
list = [
    ((255, 213, 75), '#000000', '#FFDE00', 1, 0.2, 'black', 1),  # opaque yellow banner, black title, yellow body
    # (( 255 ,215, 77), '#000000', '#000000',1,1,'black',0),  # disabled variant: black title and body
    ((255, 255, 255), '#FEDE01', '#BE3030', 0, 0.2, 'white', 1),  # banner fully transparent, yellow title, red body
    ((255, 255, 255), '#FEE001', '#FFFFFF', 0, 0.2, 'black', 1),  # banner fully transparent, yellow title, white body
]
def extract_first_frame(video_path, output_image_path):
    """Save the first frame of a video as an image file.

    Args:
        video_path: Path of the video to read.
        output_image_path: Destination image path; the extension selects the
            format (e.g. ``.jpg``, ``.png``).

    Returns:
        True only when the frame was read AND written successfully; False when
        the video cannot be opened, the first frame cannot be read, or the
        image cannot be written.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"无法打开视频文件: {video_path}")
            return False
        ret, frame = cap.read()
        if not ret:
            print("无法读取视频的第一帧")
            return False
        # cv2.imwrite reports failure through its return value rather than by
        # raising, so it must be checked before claiming success.
        if not cv2.imwrite(output_image_path, frame):
            print(f"第一帧保存失败: {output_image_path}")
            return False
        print(f"第一帧已保存至: {output_image_path}")
        return True
    finally:
        # Always release the capture handle, whatever path was taken.
        cap.release()
def sora_generate_video(prompt, video_duration, ratio):
    """Create a Sora text-to-video task on kie.ai.

    Args:
        prompt: Text prompt describing the video.
        video_duration: Sent as the ``n_frames`` field (stringified).
        ratio: ``'portrait'`` or ``'landscape'``.

    Returns:
        The task id from the response, or None when the request fails or the
        response carries no task id (the error is printed, not raised).
    """
    url = "https://api.kie.ai/api/v1/jobs/createTask"
    payload = json.dumps({
        "model": "sora-2-text-to-video-stable",
        "input": {
            "prompt": prompt,
            "aspect_ratio": ratio,
            "n_frames": str(video_duration),
            "remove_watermark": True
        }
    })
    headers = {
        'Authorization': sora_api_token,
        'Content-Type': 'application/json'
    }
    try:
        # A timeout keeps the worker from hanging forever on a stalled connection.
        response = requests.request("POST", url, headers=headers, data=payload, timeout=60)
        print(response.text)
        # `data` may be null on API errors; treat that as "no task id".
        return (response.json().get('data') or {}).get('taskId')
    except Exception as err:
        print(f'{err}')
        return None
def sora_search_video(task_id):
    """Query the status record of a Sora task.

    Network errors (``requests`` exceptions) propagate to the caller, which
    relies on catching them.

    Returns:
        The decoded JSON response, or None when the body is not valid JSON
        (the error is printed).
    """
    url = f"https://api.kie.ai/api/v1/jobs/recordInfo?taskId={task_id}"
    headers = {
        'Authorization': sora_api_token
    }
    # A timeout prevents a stalled connection from blocking the polling loop.
    response = requests.request("GET", url, headers=headers, timeout=30)
    try:
        return response.json()
    except Exception as err:
        print(f'{err}')
        return None
def sora_get_video_url(task_id: str, file_name, max_retries: int = 100) -> str:
    """Poll a Sora task until it finishes and return the first result URL.

    While the task is ``generating`` or ``waiting`` the function sleeps 3
    seconds and polls again, up to ``max_retries`` times. Any other non-success
    state writes an error message to ``info.json`` on TOS and raises.

    Returns:
        The first entry of ``resultUrls``. When the retries are exhausted an
        error message is written to ``info.json`` and the function falls off
        the end (returns None).

    Raises:
        Exception: the task ended in a non-retryable state, or the status
            request failed.
    """
    info_key = f'dso/ai_gen/video/{file_name}/info.json'
    attempts = 0
    while attempts < max_retries:
        try:
            record = sora_search_video(task_id)['data']
            state = record["state"]

            if state == "success":
                print("任务成功,获取视频链接")
                return json.loads(record['resultJson']).get('resultUrls')[0]

            # todo
            if state in ("generating", "waiting"):
                attempts += 1
                pause = 3
                print(f"任务运行中,{pause}秒后重试(第{attempts}/{max_retries}次)")
                time.sleep(pause)
                continue

            # Terminal failure: publish a user-facing reason, then raise.
            # NOTE(review): failCode 500 is mapped to the "network error" message
            # and everything else to "invalid text" - confirm against the API.
            if state == 'fail' and record["failCode"] == 500:
                reason = "异常:网络异常,请稍后再试"
            else:
                reason = "异常:你输入的文字不符合制作规则,请修改后重试"
            tos_util.put_string_to_tos(info_key, [reason])
            print(f"任务异常,状态:{state} task:{task_id}")
            raise Exception(f"任务异常,状态:{state} task:{task_id}")
        except requests.exceptions.RequestException as e:
            raise Exception(f"请求失败:{str(e)}")

    # Retries exhausted without reaching a terminal state.
    tos_util.put_string_to_tos(info_key, ["异常:网络异常,请稍后再试"])
def doubao_generate_video(text, duration, ratio):
    """Create a Doubao Seedance text-to-video task.

    The generation options (720p, duration, ratio, no watermark, free camera)
    are appended to the prompt as ``--flag value`` pairs.

    Returns:
        The task id from the response, or None when the request or JSON
        decoding fails (the error is printed, not raised).
    """
    url = "https://ark.cn-beijing.volces.com/api/v3/contents/generations/tasks"
    payload = json.dumps({
        "model": "doubao-seedance-1-0-pro-250528",
        "content": [
            {
                "type": "text",
                "text": f"{text} --resolution 720p --duration {duration} --camerafixed false --watermark false --ratio {ratio}"
            }
        ]
    })
    headers = {
        'Authorization': doubao_api_token,
        'Content-Type': 'application/json'
    }
    try:
        # A timeout keeps the worker from hanging forever on a stalled connection.
        response = requests.request("POST", url, headers=headers, data=payload, timeout=60)
        print(response.text)
        return response.json().get('id')
    except Exception as err:
        print(f'{err}')
        return None
def doubao_search_video(task_id):
    """Query the status of a Doubao generation task.

    Network errors (``requests`` exceptions) propagate to the caller.

    Returns:
        The decoded JSON response, or None when the body is not valid JSON
        (the error is printed).
    """
    url = f"https://ark.cn-beijing.volces.com/api/v3/contents/generations/tasks/{task_id}"
    headers = {
        'Authorization': doubao_api_token,
        'Content-Type': 'application/json'
    }
    # A timeout prevents a stalled connection from blocking the polling loop.
    response = requests.request("GET", url, headers=headers, timeout=30)
    try:
        return response.json()
    except Exception as err:
        print(f'{err}')
        return None
def doubao_get_video_url(task_id: str, file_name, max_retries: int = 500) -> str:
    """Poll a Doubao task until it finishes and return the video URL.

    While the task is ``running`` or ``queued`` the function sleeps 3 seconds
    and polls again, up to ``max_retries`` times. Any other non-success status
    writes an empty result list to ``info.json`` on TOS and raises.

    Returns:
        The ``content.video_url`` of the finished task, or None when the
        retries are exhausted.

    Raises:
        Exception: the task ended in a non-retryable status, or the status
            request failed.
    """
    retry_count = 0
    while retry_count < max_retries:
        try:
            task_data = doubao_search_video(task_id)
            status = task_data["status"]

            if status == "succeeded":
                print("任务成功,获取视频链接")
                print(task_data["content"]["video_url"])
                return task_data["content"]["video_url"]

            # The previous test `status == "running" or "queued"` was always
            # truthy, so failed tasks were polled until the retries ran out.
            if status in ("running", "queued"):
                retry_count += 1
                wait_time = 3
                print(f"任务运行中,{wait_time}秒后重试(第{retry_count}/{max_retries}次)")
                time.sleep(wait_time)
                continue

            # Terminal failure: publish an empty result, then raise.
            json_url = f'dso/ai_gen/video/{file_name}/info.json'
            tos_util.put_string_to_tos(json_url, [])
            print(f"任务异常,状态:{task_data['status']} task:{task_id}")
            raise Exception(f"任务异常,状态:{task_data['status']} task:{task_id}")
        except requests.exceptions.RequestException as e:
            raise Exception(f"请求失败:{str(e)}") from e
    return None
def download_video(video_url: str, save_dir: str = "./downloads", file_name: str = None) -> None:
    """Stream a remote video to ``save_dir``.

    Args:
        video_url: URL of the video.
        save_dir: Target directory, created when missing.
        file_name: Target file name. When omitted it is derived from the URL
            path and given a ``.mp4`` suffix if it lacks one.

    Raises:
        Exception: the HTTP request failed; a partially written file is removed.
    """
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    if not file_name:
        file_name = os.path.basename(urlparse(video_url).path)
        # URL paths do not always end in a usable extension.
        if not file_name.endswith(".mp4"):
            file_name += ".mp4"
    save_path = os.path.join(save_dir, file_name)
    try:
        # stream=True avoids loading the whole file in memory; the context
        # manager returns the connection to the pool even if writing fails.
        with requests.get(video_url, stream=True, timeout=30) as response:
            response.raise_for_status()  # surface 4xx/5xx as exceptions
            with open(save_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1 MB chunks
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        print(f"\n下载完成!视频已保存至:{save_path}")
    except requests.exceptions.RequestException as e:
        # Do not leave a truncated file behind.
        if os.path.exists(save_path):
            os.remove(save_path)
        raise Exception(f"视频下载失败:{str(e)}") from e
def gen_16_9(task_id, title, bottom_text_content, file_name, color_list):
    """Render the landscape (16:9) branded version of a downloaded video.

    Reads ``<dir><file_name>/<task_id>.mp4`` and writes
    ``<dir><file_name>/<task_id>_result.mp4``. ``color_list`` is one theme
    tuple: (overlay RGB, title colour, body colour, title opacity,
    body opacity, stroke colour, stroke width).
    """
    work_dir = f"{dir}{file_name}/"
    render_args = {
        "input_video": f"{work_dir}{task_id}.mp4",
        "output_video": f"{work_dir}{task_id}_result.mp4",
        # title
        "text_content": title,
        "text_content_color": color_list[1],
        "title_overlay_opacity": color_list[3],
        "overlay_color": color_list[0],
        "font_size": 90,
        # disclaimer
        "vertical_font_size": 20,
        # body
        "bottom_overlay_opacity": color_list[4],
        "bottom_font_size": 50,
        "bottom_stroke_color": color_list[5],
        "bottom_stroke_width": color_list[6],
        "bottom_text_content": bottom_text_content,
        "bottom_text_content_color": color_list[2],
        "background_music": f'{dir}Pond.mp3',
    }
    video_gen.gen_16_9_video(**render_args)
def gen_9_16(task_id, title, bottom_text_content, file_name, color_list):
    """Render the portrait (9:16) branded version of a downloaded video.

    Titles longer than 9 characters are broken after the 6th character and
    drawn with a smaller font (60 instead of 90). Paths and ``color_list``
    follow the same layout as the landscape variant.
    """
    needs_wrap = len(title) > 9
    if needs_wrap:
        title = title[:6] + '\n' + title[6:]
    font_size = 60 if needs_wrap else 90

    work_dir = f"{dir}{file_name}/"
    render_args = {
        "input_video": f"{work_dir}{task_id}.mp4",
        "output_video": f"{work_dir}{task_id}_result.mp4",
        # title
        "text_content": title,
        "text_content_color": color_list[1],
        "title_overlay_opacity": color_list[3],
        "overlay_color": color_list[0],
        "font_size": font_size,
        # disclaimer
        "vertical_font_size": 25,
        # body
        "bottom_overlay_opacity": color_list[4],
        "bottom_font_size": 80,
        "bottom_stroke_color": color_list[5],
        "bottom_stroke_width": color_list[6],
        "bottom_text_content": bottom_text_content,
        "bottom_text_content_color": color_list[2],
        "background_music": f'{dir}Pond.mp3',
    }
    video_gen.gen_9_16_video(**render_args)
def gen_video_process(model, video_text, video_duration, video_ratio, title, bottom_text_content, file_name):
    """Generate, brand and publish one video.

    Steps: create the generation task (model 1 = Doubao, model 2 = Sora), wait
    for the result URL, download it, render the branded version, then upload
    both videos, their first frames and an ``info.json`` listing the two
    public video URLs.

    Args:
        model: 1 for Doubao, 2 for Sora.
        video_text: Prompt sent to the generator.
        video_duration: Requested duration, passed through to the generator.
        video_ratio: ``'16:9'`` or ``'9:16'``.
        title / bottom_text_content: Texts drawn on the branded version.
        file_name: Task folder name, used for local and remote paths.

    Raises:
        ValueError: unsupported ``model`` or ``video_ratio``.
        RuntimeError: the task could not be created or produced no URL.
    """
    # Validate before spending a paid generation call on a request that
    # cannot be rendered afterwards.
    if model not in (1, 2):
        raise ValueError(f"不支持的模型类型: {model}")
    if video_ratio not in ('16:9', '9:16'):
        raise ValueError(f"不支持的比例类型: {video_ratio}")
    landscape = video_ratio == '16:9'

    color_list = random.choice(list)

    if model == 1:
        task_id = doubao_generate_video(video_text, video_duration, video_ratio)
    else:
        task_id = sora_generate_video(video_text, video_duration, 'landscape' if landscape else 'portrait')
    # The create helpers return None on failure instead of raising.
    if not task_id:
        raise RuntimeError(f"视频生成任务创建失败: model={model}")

    if model == 1:
        video_url = doubao_get_video_url(task_id, file_name)
    else:
        video_url = sora_get_video_url(task_id, file_name)
    # The polling helpers return None when their retries are exhausted.
    if not video_url:
        raise RuntimeError(f"未获取到视频链接: task={task_id}")

    work_dir = f'{dir}{file_name}/'
    download_video(video_url, work_dir, f'{task_id}.mp4')
    if landscape:
        gen_16_9(task_id, title, bottom_text_content, file_name, color_list)
    else:
        gen_9_16(task_id, title, bottom_text_content, file_name, color_list)

    # Local artefacts ("nc" = the raw video without branding).
    raw_mp4 = f'{work_dir}{task_id}.mp4'
    result_mp4 = f'{work_dir}{task_id}_result.mp4'
    raw_jpg = f'{work_dir}{task_id}_nc.jpg'
    result_jpg = f'{work_dir}{task_id}.jpg'

    # Remote object keys.
    mp4_url = f'dso/ai_gen/video/{file_name}/{file_name}.mp4'
    mp4_url_nc = f'dso/ai_gen/video/{file_name}/{file_name}_nc.mp4'
    jpg_url = f'dso/ai_gen/video/{file_name}/{file_name}.jpg'
    jpg_url_nc = f'dso/ai_gen/video/{file_name}/{file_name}_nc.jpg'
    json_url = f'dso/ai_gen/video/{file_name}/info.json'

    # Upload the videos.
    tos_util.put_video(mp4_url, result_mp4)
    tos_util.put_video(mp4_url_nc, raw_mp4)

    # Extract and upload the cover frames.
    extract_first_frame(raw_mp4, raw_jpg)
    extract_first_frame(result_mp4, result_jpg)
    tos_util.put_video(jpg_url_nc, raw_jpg)
    tos_util.put_video(jpg_url, result_jpg)

    # Publish the result manifest last, once every asset is in place.
    tos_util.put_string_to_tos(json_url,
                               [f'https://static2.douchacha.com/{mp4_url}',
                                f'https://static2.douchacha.com/{mp4_url_nc}'])
if __name__ == '__main__':
    # Disabled worker entry point: pop one task from the Redis list
    # 'dso_video_gen_task', decode it and run the full pipeline.
    # task = redis_client.rpop('dso_video_gen_task')
    # try:
    #     task_json = json.loads(task)
    #     model = task_json['model']
    #     video_text = task_json['video_text']
    #     video_duration = task_json['video_duration']
    #     video_ratio = task_json['video_ratio']
    #     title = task_json['title']
    #     bottom_text_content = task_json['bottom_text_content']
    #     file_name = task_json['task_id']
    #     gen_video_process(model, video_text, video_duration, video_ratio, title, bottom_text_content, file_name)
    # except Exception as err:
    #     print(f'{err}')

    # Debug loop currently in effect: print the status of one hard-coded
    # Doubao task every 3 seconds, forever.
    while 1:
        print(doubao_search_video('cgt-20260409205146-hpnpn'))
        time.sleep(3)
    # Other disabled debugging aids, kept for reference:
    # print(sora_search_video('dfb37c95397b88e85b48b055a5ccccbb'))
    #
    # Layout notes from the author:
    #   9:16  - body of 7 lines x 7 characters; a 6-character title needs no line break
    #   16:9  - titles up to 12 characters need no special handling
    #
    # Sample task payload used for local runs (fed to the same pipeline as above;
    # on failure the task was meant to be pushed back with
    # redis_client.rpush('dso_video_gen', task)):
    # task_json = {"bottom_text_content": "\n天安门庄严雄伟\n天安门庄严雄伟\n天安门庄严雄伟 ", "creationType": "VIDEO",
    #              "keyword": "天安门", "model": 2, "task_id": "100222822222222611",
    #              "title": "晨跑好还是夜跑好", "video_duration": 10, "video_ratio": "9:16",
    #              "video_text": "晨跑好还是夜跑好 俩个人在跑步对比 一个是白天一个是黑夜"}
import json
import redis
import subprocess
import time
import os
import psutil # 用于通过PID检查进程是否存活
def main():
    """Dispatch queued video tasks to worker processes, at most 10 at a time.

    Each loop iteration drops finished worker PIDs, then - if a slot is free -
    moves one task from the pending list to the in-flight list and launches
    ``video_gen_progress.py`` in the background. Runs until Ctrl+C, then waits
    for the remaining workers.
    """
    # NOTE(review): connection settings, including the password, are hardcoded.
    redis_host = '172.16.0.24'
    redis_port = 6379
    redis_db = 8
    redis_password = 'aiyingli@@123'
    redis_list_key = 'dso_video_gen'  # pending tasks
    redis_list_key_task = 'dso_video_gen_task'  # tasks handed over to workers
    max_concurrent = 10

    try:
        r = redis.Redis(
            host=redis_host,
            port=redis_port,
            db=redis_db,
            password=redis_password,
            decode_responses=True  # return str instead of bytes
        )
        r.ping()
        print("成功连接到Redis")
    except Exception as e:
        print(f"Redis连接失败: {e}")
        return

    # PIDs of the real worker processes (not of the launching shell).
    task_pids = []
    try:
        print("开始监控任务队列... (按Ctrl+C停止)")
        while True:
            # Drop workers that have exited.
            active_pids = []
            for pid in task_pids:
                if psutil.pid_exists(pid):
                    print(f"任务进程PID {pid} 正在执行")
                    active_pids.append(pid)
                else:
                    print(f"任务进程PID {pid} 已结束")
            task_pids = active_pids

            if len(task_pids) >= max_concurrent:
                print(f"已达到最大并发数 {max_concurrent},等待10秒...")
                time.sleep(10)
                continue

            # Take one task from the right end of the pending list.
            task = r.rpop(redis_list_key)
            if task:
                print(task)
                # lpush returns the new list length; keep `task` bound to the
                # payload so a failed launch can put the real task back.
                r.lpush(redis_list_key_task, task)
                try:
                    # The shell backgrounds the worker with nohup and writes the
                    # worker's PID ($!) to a temp file that is read back below.
                    pid_file = f"/tmp/video_task_{int(time.time())}.pid"
                    cmd = (
                        f'nohup python3.9 ./video_gen_progress.py '
                        f'> /dev/null 2>&1 & echo $! > {pid_file}'
                    )
                    process = subprocess.Popen(
                        cmd,
                        shell=True,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        text=True
                    )
                    # Wait for the launcher shell so the PID file is complete.
                    _, stderr = process.communicate()
                    if stderr:
                        raise Exception(f"命令执行错误: {stderr}")
                    if not os.path.exists(pid_file):
                        raise Exception("未生成PID文件,无法获取任务进程ID")
                    with open(pid_file, 'r') as f:
                        pid = int(f.read().strip())
                    os.remove(pid_file)
                    task_pids.append(pid)
                    print(f"启动任务进程(PID: {pid}),当前并发数: {len(task_pids)}/{max_concurrent}")
                except Exception as e:
                    print(f"启动任务失败: {e}")
                    # Undo the hand-over, otherwise the task would sit in both
                    # lists and be processed twice.
                    r.lrem(redis_list_key_task, 1, task)
                    r.rpush(redis_list_key, task)
            # Pace the loop, whether or not a task was found.
            time.sleep(5)
    except KeyboardInterrupt:
        print("\n收到停止信号,正在等待所有任务结束...")
        # Let the workers finish on their own; use
        # psutil.Process(pid).terminate() instead to force them.
        for pid in task_pids:
            try:
                if psutil.pid_exists(pid):
                    psutil.Process(pid).wait()
                    print(f"任务进程PID {pid} 已终止")
            except psutil.NoSuchProcess:
                # The worker exited between the existence check and the wait.
                pass
        print("所有任务已处理完毕,程序退出")


if __name__ == "__main__":
    main()
\ No newline at end of file
# clients/tos_client.py
import tos
from aidso_geo.config import TOSConfig # 从配置中读取参数
class TOSClient:
    """Thin wrapper that builds a TOS client from the project configuration."""

    def __init__(self):
        settings = TOSConfig.TOSConfig
        self.client = tos.TosClientV2(
            settings.TOS_AK,
            settings.TOS_SK,
            settings.TOS_ENDPOINT,
            settings.TOS_REGION,
            max_retry_count=3  # retry count passed to the SDK client
        )
        self.bucket_name = settings.TOS_BUCKET_NAME

    def get_client(self):
        """Return the underlying SDK client."""
        return self.client
# Module-level instance shared by importers; it is created at import time.
tos_client = TOSClient()
if __name__ == '__main__':
    # NOTE(review): this prints the TOS access key to stdout - avoid running it
    # where output is captured or logged.
    print(TOSConfig.TOSConfig.TOS_AK)
\ No newline at end of file
from .base_config import BaseConfig
class TOSConfig(BaseConfig):
    # Connection settings for the TOS object store.
    # NOTE(review): the access key and secret key are committed in plain text;
    # they should be rotated and supplied through the environment or a secret
    # store instead.
    TOS_AK = "AKLTMmRiMmU3YmY5ZjZjNDZkMTlhMmQxY2JkYTllYTQzNDI"
    TOS_SK = "WkdWak4yUTRNakl3WVdOa05HUXdaR0V4TlRBM1l6YzJZMll5WkRnMFlUTQ=="
    TOS_ENDPOINT = "tos-cn-beijing.volces.com"
    TOS_REGION = "cn-beijing"
    TOS_BUCKET_NAME = "douchacha-web"
\ No newline at end of file
# config/base_config.py
import json
import os
import time
from enum import Enum
import redis
def init_redis():
    """Build a Redis client for database 0.

    Returns:
        A new ``redis.Redis`` instance on every call, or None when building
        the client raises.
    """
    options = {
        "host": '172.16.0.24',
        # alternative host: 'redis-cnlfmu7rl14awitrz.redis.ivolces.com'
        "port": 6379,
        "db": 0,
        "password": 'aiyingli@@123',
        "socket_timeout": 5,
        "decode_responses": True,  # return str instead of bytes
    }
    try:
        return redis.Redis(**options)
    except Exception:
        return None
# Lazily created client / pool for Redis database 8 (see init_redis8).
_redis8_client = None
_redis8_pool = None


def init_redis8():
    """Return the shared Redis client for database 8, creating it on first use.

    The client is built on a connection pool and verified with a PING. When
    that fails the cached client and pool are reset and None is returned, so
    the next call tries again.
    """
    global _redis8_client, _redis8_pool
    if _redis8_client is None:
        pool_options = dict(
            host='redis-cnlfmu7rl14awitrz.redis.ivolces.com',
            port=6379,
            db=8,
            password='aiyingli@@123',
            socket_timeout=5,
            socket_connect_timeout=5,
            decode_responses=True,
            retry_on_timeout=True,
            health_check_interval=30,
            socket_keepalive=True,
            max_connections=20,
        )
        try:
            _redis8_pool = redis.ConnectionPool(**pool_options)
            _redis8_client = redis.Redis(connection_pool=_redis8_pool)
            _redis8_client.ping()
        except Exception as e:
            print(f"Redis 初始化失败: {e}")
            _redis8_client = None
            _redis8_pool = None
    return _redis8_client
class PlatformType(Enum):
    """Platform identifiers; each value is the short code used in keys and paths."""

    DEEPSEEK = "DP"
    DOUBAO = "DB"
    YUANBAO = "TXYB"
    QIANWEN = "TYQW"
    KIMI = "KIMI"
    WENXINYIYAN = "WXYY"
    BAIDUAI = "BDAI"
    DOUYINAI = "DYAI"
    DOUBAOANDROID = "DOUBA"
    DEEPSEEKANDROID = "DPA"
    QIANWENANDROID = "TYQWA"
    YUANBAOANDROID = "TXYBA"
    XIAOHONGSHUANDROID = "XHSA"

    @classmethod
    def from_str(cls, platform_str: str) -> "PlatformType":
        """Return the member whose value equals ``platform_str``.

        Raises:
            ValueError: no member has that value; the message lists the
                accepted codes.
        """
        found = next((member for member in cls if member.value == platform_str), None)
        if found is None:
            raise ValueError(f"无效的平台字符串:{platform_str},可选值:{[m.value for m in cls]}")
        return found
class BaseConfig:
    # Root of the per-platform API endpoints.
    base_url = "http://172.16.1.37:10444/api/v1/"
    # Per-platform settings keyed by the PlatformType value (short code):
    #   "url"          - endpoint for that platform under base_url
    #   "storage_path" - callable mapping a task id to the storage key of the
    #                    platform's original text
    PLATFORM_CONFIGS = {
        PlatformType.DEEPSEEK.value: {
            "url": f"{base_url}deepseek",
            "storage_path": lambda tid: f"geo/{tid}/DP/original.text"
        },
        PlatformType.DOUBAO.value: {
            "url": f"{base_url}doubao",
            "storage_path": lambda tid: f"geo/{tid}/DB/original.text"
        },
        PlatformType.YUANBAO.value: {
            "url": f"{base_url}yuanbao",
            "storage_path": lambda tid: f"geo/{tid}/TXYB/original.text"
        },
        PlatformType.QIANWEN.value: {
            "url": f"{base_url}qianwen",
            "storage_path": lambda tid: f"geo/{tid}/TYQW/original.text"
        },
        PlatformType.KIMI.value: {
            "url": f"{base_url}kimi",
            "storage_path": lambda tid: f"geo/{tid}/KIMI/original.text"
        },
        PlatformType.WENXINYIYAN.value: {
            "url": f"{base_url}wenxinyiyan",
            "storage_path": lambda tid: f"geo/{tid}/WXYY/original.text"
        },
        PlatformType.BAIDUAI.value: {
            "url": f"{base_url}baiduai",
            "storage_path": lambda tid: f"geo/{tid}/BDAI/original.text"
        },
        # The endpoint slug is "aidouyin", not "douyinai".
        PlatformType.DOUYINAI.value: {
            "url": f"{base_url}aidouyin",
            "storage_path": lambda tid: f"geo/{tid}/DYAI/original.text"
        },
        PlatformType.DOUBAOANDROID.value: {
            "url": f"{base_url}doubao_android",
            "storage_path": lambda tid: f"geo/{tid}/DOUBA/original.text"
        },
        PlatformType.DEEPSEEKANDROID.value: {
            "url": f"{base_url}deepseek_android",
            "storage_path": lambda tid: f"geo/{tid}/DPA/original.text"
        },
        PlatformType.QIANWENANDROID.value: {
            "url": f"{base_url}qianwen_android",
            "storage_path": lambda tid: f"geo/{tid}/TYQWA/original.text"
        },
        PlatformType.YUANBAOANDROID.value: {
            "url": f"{base_url}yuanbao_android",
            "storage_path": lambda tid: f"geo/{tid}/TXYBA/original.text"
        },
        # The endpoint slug is "hongshu_android".
        PlatformType.XIAOHONGSHUANDROID.value: {
            "url": f"{base_url}hongshu_android",
            "storage_path": lambda tid: f"geo/{tid}/XHSA/original.text"
        }
    }
if __name__ == '__main__':
    # Ad-hoc monitor: print the length of every work queue every 10 seconds.
    platform_codes = ('BDAI', 'DB', 'DOUBA', 'DP', 'DPA', 'DYAI',
                      'KIMI', 'TXYB', 'TXYBA', 'TYQW', 'TYQWA', 'WXYY')
    # Each platform has a streaming-batch queue followed by a batch queue.
    key_list = [
        f'{code}:geo:{kind}:list'
        for code in platform_codes
        for kind in ('stream_batch', 'batch')
    ]
    key_list.append('geo:task_commit:list')

    t = init_redis()
    # print(t.scard('mt_third_task'))
    while 1:
        for k in key_list:
            # print(t.smembers(k))
            print(f"{k}----{t.llen(k)}")
        time.sleep(10)
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from loguru import logger
from apscheduler.schedulers.blocking import BlockingScheduler
from aidso_geo.config.base_config import init_redis
from aidso_geo.models.process import main_process
# Shared Redis client, created at import time.
redis_client = init_redis()
# Redis list holding the committed tasks.
QUEUE_KEY = "geo:task_commit:list"
# Maximum number of tasks pulled per round.
BATCH_SIZE = 1000
# Interval between rounds, in seconds.
INTERVAL_SECONDS = 60
# Worker threads used within one round.
CONCURRENT_WORKERS = 40
def parse_task(raw):
    """Decode one queue entry (``str`` or UTF-8 ``bytes``) from JSON.

    Returns:
        The decoded value, or None when decoding fails (the failure is logged
        with its traceback).
    """
    try:
        raw = raw.decode("utf-8") if isinstance(raw, bytes) else raw
        parsed = json.loads(raw)
    except Exception as e:
        logger.exception(f"解析队列数据失败: {e}, raw={raw}")
        return None
    else:
        return parsed
def pull_tasks():
    """
    Pop up to BATCH_SIZE entries from the Redis queue and return the decoded ones.

    Stops early when the queue is empty. Entries that fail to decode, or that
    decode to a falsy value, are left out of the result.
    """
    collected = []
    for _ in range(BATCH_SIZE):
        item = redis_client.rpop(QUEUE_KEY)
        if not item:
            break
        task = parse_task(item)
        if not task:
            continue
        collected.append(task)
    return collected
def handle_one_task(task):
    """
    Run ``main_process`` for a single task.

    Returns True when it completes. On any exception the failure is logged,
    the task is pushed back onto the Redis queue and False is returned.
    """
    req_id, platform = task.get("reqId", ""), task.get("platform", "")
    try:
        main_process(task)
    except Exception as e:
        logger.exception(f"{req_id}--{platform}--处理失败,重新放回队列: {e}")
        requeued = json.dumps(task, ensure_ascii=False)
        redis_client.lpush(QUEUE_KEY, requeued)
        return False
    return True
def consume_task_queue():
    """
    One scheduled round:
    1. pull a batch of tasks from the Redis queue
    2. run ``handle_one_task`` for each of them on a thread pool

    Exceptions are logged, never raised.
    """
    started = time.time()
    try:
        tasks = pull_tasks()
        if not tasks:
            logger.info("本轮没有任务")
            return

        with ThreadPoolExecutor(max_workers=CONCURRENT_WORKERS) as pool:
            pending = []
            for task in tasks:
                pending.append(pool.submit(handle_one_task, task))
            for done in as_completed(pending):
                try:
                    done.result()
                except Exception as e:
                    logger.exception(f"线程任务异常: {e}")

        cost = round(time.time() - started, 2)
        logger.success(
            f"本轮处理完成,总数: {len(tasks)}, "
            f"耗时: {cost}s"
        )
    except Exception as e:
        logger.exception(f"定时消费任务异常: {e}")
if __name__ == "__main__":
    # Run the consumer every INTERVAL_SECONDS on a blocking scheduler.
    scheduler = BlockingScheduler(timezone="Asia/Shanghai")
    scheduler.add_job(
        consume_task_queue,
        trigger="interval",
        seconds=INTERVAL_SECONDS,
        id="consume_geo_task_commit_queue",
        max_instances=5,  # allow up to 5 scheduled rounds to run at the same time
        coalesce=False,  # do not merge missed runs
        misfire_grace_time=60
    )
    logger.success(
        f"geo task commit consumer 启动,每 {INTERVAL_SECONDS}s 执行一次,"
        f"QUEUE_KEY={QUEUE_KEY}, "
        f"BATCH_SIZE={BATCH_SIZE}, "
        f"CONCURRENT_WORKERS={CONCURRENT_WORKERS}, "
        f"max_instances=5"
    )
    # Run one round immediately instead of waiting for the first interval.
    consume_task_queue()
    scheduler.start()
import os
import time
import requests
import json
from collections import defaultdict
from datetime import datetime, timedelta
from openpyxl import Workbook
import aidso_geo.utils.bh_utils as bh_utils
import aidso_geo.utils.tos_utils as tos_utils
class ExcelWriter:
    """Write a list of dicts to the first sheet of a new ``.xlsx`` workbook."""

    def __init__(self, file_path):
        # Append the extension when the caller left it out.
        if not file_path.lower().endswith(".xlsx"):
            file_path += ".xlsx"
        self.file_path = file_path

        # Make sure the target directory exists.
        folder = os.path.dirname(self.file_path)
        if folder and not os.path.exists(folder):
            os.makedirs(folder, exist_ok=True)

        self.workbook = Workbook()
        self.worksheet = self.workbook.active
        self.worksheet.title = "Sheet1"

    def write_batch_rows(self, data_list):
        """
        Write the rows and save the workbook.

        The keys of the first dict become the header row; keys missing from a
        later dict are written as empty strings. An empty list writes nothing.

        :param data_list: list[dict]
        """
        if len(data_list) == 0:
            return

        sheet = self.worksheet
        columns = list(data_list[0].keys())

        # Header goes to row 1.
        for col_no, title in enumerate(columns, start=1):
            sheet.cell(row=1, column=col_no, value=title)

        # Data rows start at row 2.
        row_no = 1
        for record in data_list:
            row_no += 1
            for col_no, title in enumerate(columns, start=1):
                sheet.cell(row=row_no, column=col_no, value=record.get(title, ""))

        self.workbook.save(self.file_path)

    def close(self):
        """Close the workbook if one is held."""
        if self.workbook:
            self.workbook.close()
def regroup_by_brand(data_list):
    """Group request records by brand name.

    Records with an empty brand name (after stripping) or without a ``req_id``
    are ignored. Brands keep the order of their first appearance; within a
    brand the records are sorted by ``created_at`` (missing values sort as "").

    Returns:
        ``[{"brand_name": ..., "data": [{"req_id": ..., "created_at": ...}]}]``
    """
    grouped = {}
    for entry in data_list:
        brand = (entry.get("brand_name") or "").strip()
        rid = entry.get("req_id")
        created = entry.get("created_at")
        if brand and rid:
            grouped.setdefault(brand, []).append({
                "req_id": rid,
                "created_at": created
            })

    return [
        {
            "brand_name": brand,
            "data": sorted(rows, key=lambda row: row.get("created_at") or "")
        }
        for brand, rows in grouped.items()
    ]
# Display names (used in exports) keyed by platform short code.
plat_form_map = {
    "DP": "deepseek网页版",
    "DB": "豆包网页版",
    "TXYB": "腾讯元宝",
    "TYQW": "通义千问",
    "KIMI": "kimi",
    "WXYY": "文心一言",
    "BDAI": "百度ai",
    "DYAI": "抖音ai",
    "DOUBA": "豆包安卓版",
    "DPA": "deepseek安卓版",
    "TXYBA": "腾讯元宝安卓版",
    "TYQWA": "通义千问安卓版",
}
def to_excel(req_data_list, file_name, output_dir):
    """Export the answers of one brand's requests to an Excel file.

    Looks up the committed tasks of the given requests (platforms DB, DOUBA,
    TYQW and TXYB only), loads each answer text from TOS and writes one row
    per task with the columns prompt / content / platform / created_at.

    Args:
        req_data_list: ``[{"req_id": ..., "created_at": ...}]``.
        file_name: Label used in the progress messages only.
        output_dir: Path of the Excel file to write (despite the name, this is
            a file path; ``.xlsx`` is appended when missing).
    """
    if not req_data_list:
        print(f"{file_name} 没有可导出的 req 数据")
        return

    # 1. req_id -> created_at. Dict keys keep first-seen order and are unique,
    #    which also de-duplicates the ids for the SQL below.
    req_time_map = {}
    for item in req_data_list:
        req_id = item.get("req_id")
        if not req_id:
            continue
        req_time_map[req_id] = item.get("created_at")
    if not req_time_map:
        print(f"{file_name} 没有有效 req_id")
        return

    # 2. Build the IN (...) list.
    # NOTE(review): the ids are interpolated into the SQL text; switch to bound
    # parameters if bh_utils.query_data supports them, since a quote inside an
    # id would break (or alter) the statement.
    req_id_sql = ",".join(f"'{req_id}'" for req_id in req_time_map)
    query_sql = f"select * from geo_commit_task where reqId in ({req_id_sql}) and platform in ('DB','DOUBA','TYQW','TXYB')"
    query_list = bh_utils.query_data(query_sql)
    if not query_list:
        print(f"{file_name} 查询不到 geo_commit_task 数据")
        return

    # 3. One row per task. Display names come from the module-level
    #    plat_form_map; an unknown code falls back to the code itself.
    task_list = []
    for row in query_list:
        platform = row.get("platform")
        task_list.append({
            "prompt": row.get("prompt"),
            "content": tos_utils.get_string_from_tos(f"geo/{row.get('taskId')}/{platform}/context.txt"),
            "platform": plat_form_map.get(platform, platform),
            "created_at": req_time_map.get(row.get('reqId')),
        })

    # 4. Write the file; close the workbook even when writing fails.
    excel_writer = ExcelWriter(output_dir)
    try:
        excel_writer.write_batch_rows(task_list)
    finally:
        excel_writer.close()
    print(f"{file_name} 数据已成功写入Excel文件: {output_dir}")
def export_all_brand_excel(data_list, phone, begin):
    """
    Export one Excel file per brand found in ``data_list``.

    Files are written under a per-date, per-phone folder on the local desktop;
    ``begin`` (YYYY-MM-DD) supplies the date part with the dashes removed.
    """
    brand_groups = regroup_by_brand(data_list)
    if not brand_groups:
        print("没有可分组导出的数据")
        return

    out_date = begin.replace('-', '')
    base_folder = f"/Users/yaowentong/Desktop/original_{out_date}/{phone}/{out_date}"
    for group in brand_groups:
        brand_name = group.get("brand_name")
        target = f"{base_folder}/{brand_name}_{out_date}_original"
        to_excel(group.get("data", []), brand_name, output_dir=target)
def get_req_id(phone, begin, end):
    """Fetch the request list of an account for a date range.

    Args:
        phone: Account phone number.
        begin / end: Dates as ``YYYY-MM-DD``; the range spans from 00:00:00 of
            ``begin`` to 23:59:59 of ``end``.

    Returns:
        The ``data`` field of the JSON response (None when absent).
    """
    url = "https://openapi.aidso.com/openapi/ywt/geoReqList"
    # Let requests build the query string so spaces, colons and characters
    # such as '+' in the values are percent-encoded correctly.
    params = {
        "phone": phone,
        "begin": f"{begin} 00:00:00",
        "end": f"{end} 23:59:59",
    }
    headers = {
        'Authorization': ''
    }
    # A timeout keeps the export from hanging on a stalled connection.
    response = requests.request("GET", url, headers=headers, params=params, timeout=30)
    return response.json().get('data')
def get_url_top40(user_id, brand_library_id, begin, end, p):
    """Fetch the top-40 trend entries for a brand library.

    Args:
        user_id / brand_library_id: Identify the brand library.
        begin / end: Dates as ``YYYY-MM-DD`` (full days are covered).
        p: A platform code or a list of platform codes.

    Returns:
        The ``data`` field of the JSON response (None when absent).
    """
    url = "https://openapi.aidso.com/openapi/ywt/trends"
    # Accept a single code for convenience.
    if isinstance(p, str):
        p = [p]
    payload = json.dumps({
        "brand_library_id": brand_library_id,
        "user_id": user_id,
        "begin_time": f'{begin} 00:00:00',
        "end_time": f'{end} 23:59:59',
        "limit": 40,
        "platform_list": p
    })
    headers = {
        'Authorization': '',
        'Content-Type': 'application/json'
    }
    # A timeout keeps the caller from hanging on a stalled connection.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=30)
    return response.json().get('data')
def export_base_info(phone, begin, end):
    """Export the per-brand Excel files of an account for a date range."""
    # The API helper returns None when the response has no `data`; treat that
    # as "nothing to export" instead of failing while grouping.
    data_list = get_req_id(phone, begin, end) or []
    export_all_brand_excel(data_list, phone, begin)
def generate_date_list(start_date: str, end_date: str):
    """Return every date from ``start_date`` to ``end_date`` inclusive.

    Both arguments and all results use the ``YYYY-MM-DD`` format. An empty
    list is returned when the start lies after the end.
    """
    fmt = "%Y-%m-%d"
    first = datetime.strptime(start_date, fmt)
    last = datetime.strptime(end_date, fmt)
    # Both values are at midnight, so the day difference is exact; a negative
    # span yields an empty range.
    span = (last - first).days
    return [
        (first + timedelta(days=offset)).strftime(fmt)
        for offset in range(span + 1)
    ]
def get_platforms(brand_library_id, user_id, begin_time, end_time):
    """Fetch the per-platform overview of a brand library.

    :param brand_library_id: brand library to query.
    :param user_id: owner of the brand library.
    :param begin_time: first day (``YYYY-MM-DD``); ``00:00:00`` is appended.
    :param end_time: last day (``YYYY-MM-DD``); ``23:59:59`` is appended.
    :return: the ``data`` object of the response when ``code`` is 200,
        otherwise an empty dict. Failures also return an empty dict so the
        result can always be read with ``.get``; it stays falsy and iterates
        as empty, like the previous ``[]``.
    """
    url = "https://openapi.aidso.com/openapi/ywt/platforms"
    payload = json.dumps({
        "brand_library_id": brand_library_id,
        "user_id": user_id,
        "begin_time": f"{begin_time} 00:00:00",
        "end_time": f"{end_time} 23:59:59"
    })
    headers = {
        'Content-Type': 'application/json'
    }
    try:
        response = requests.post(
            url,
            headers=headers,
            data=payload,
            timeout=15
        )
        response.raise_for_status()
        result = response.json()
        if result.get("code") == 200:
            # A JSON null "data" is normalised to an empty dict as well.
            return result.get("data") or {}
        return {}
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return {}
    except ValueError as e:
        print(f"返回结果不是合法 JSON: {e}")
        return {}
def get_all_word_rank(brand_library_id, user_id, begin_time, end_time):
    """Fetch the ranking of all brand words for a brand library.

    :param brand_library_id: brand library to query.
    :param user_id: owner of the brand library.
    :param begin_time: first day (``YYYY-MM-DD``); ``00:00:00`` is appended.
    :param end_time: last day (``YYYY-MM-DD``); ``23:59:59`` is appended.
    :return: the ``data`` field of the response when ``code`` is 200,
        otherwise an empty list (also on request or JSON errors).
    """
    endpoint = "https://openapi.aidso.com/openapi/ywt/all_word_rank"
    body = json.dumps({
        "brand_library_id": brand_library_id,
        "user_id": user_id,
        "begin_time": f"{begin_time} 00:00:00",
        "end_time": f"{end_time} 23:59:59",
        "last": True
    })
    try:
        resp = requests.post(
            endpoint,
            headers={'Content-Type': 'application/json'},
            data=body,
            timeout=15,
        )
        resp.raise_for_status()
        parsed = resp.json()
        if parsed.get("code") != 200:
            return []
        return parsed.get("data", [])
    except requests.RequestException as e:
        print(f"请求失败: {e}")
    except ValueError as e:
        print(f"返回结果不是合法 JSON: {e}")
    # Reached only after one of the handlers above has reported the failure.
    return []
def get_sentiment_overview(brand_library_id, user_id, begin_time, end_time):
    """Fetch the sentiment overview of a brand library.

    :param brand_library_id: brand library to query.
    :param user_id: owner of the brand library.
    :param begin_time: first day (``YYYY-MM-DD``); ``00:00:00`` is appended.
    :param end_time: last day (``YYYY-MM-DD``); ``23:59:59`` is appended.
    :return: the ``data`` object of the response when ``code`` is 200,
        otherwise an empty dict. Failures also return an empty dict so the
        result can always be read with ``.get``.
    """
    url = "https://openapi.aidso.com/openapi/ywt/overview"
    payload = json.dumps({
        "brand_library_id": brand_library_id,
        "user_id": user_id,
        "begin_time": f"{begin_time} 00:00:00",
        "end_time": f"{end_time} 23:59:59"
    })
    headers = {
        'Content-Type': 'application/json'
    }
    try:
        response = requests.post(
            url,
            headers=headers,
            data=payload,
            timeout=15
        )
        response.raise_for_status()
        result = response.json()
        if result.get("code") == 200:
            return result.get("data") or {}
        return {}
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return {}
    except ValueError as e:
        # Older requests versions raise a plain ValueError for a non-JSON
        # body; handle it the same way the sibling helpers do.
        print(f"返回结果不是合法 JSON: {e}")
        return {}
def get_platforms_sentiment(brand_library_id, user_id, begin_time, end_time):
    """Fetch the per-platform sentiment figures of a brand library.

    :param brand_library_id: brand library to query.
    :param user_id: owner of the brand library.
    :param begin_time: first day (``YYYY-MM-DD``); ``00:00:00`` is appended.
    :param end_time: last day (``YYYY-MM-DD``); ``23:59:59`` is appended.
    :return: the ``data`` field of the response when ``code`` is 200
        (``zhou_report`` iterates it as a list of per-platform dicts),
        otherwise an empty list.
    """
    url = "https://openapi.aidso.com/openapi/ywt/platforms/sentiment"
    payload = json.dumps({
        "brand_library_id": brand_library_id,
        "user_id": user_id,
        "begin_time": f"{begin_time} 00:00:00",
        "end_time": f"{end_time} 23:59:59",
        "last": True
    })
    headers = {
        'Content-Type': 'application/json'
    }
    try:
        response = requests.post(
            url,
            headers=headers,
            data=payload,
            timeout=15
        )
        response.raise_for_status()
        result = response.json()
        if result.get("code") == 200:
            # Missing or null data becomes an empty list, matching the
            # failure paths below.
            return result.get("data") or []
        return []
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return []
    except ValueError as e:
        # Older requests versions raise a plain ValueError for a non-JSON
        # body; handle it the same way the sibling helpers do.
        print(f"返回结果不是合法 JSON: {e}")
        return []
def get_site_category(brand_library_id, user_id, begin_time, end_time, p):
    """Fetch quote statistics grouped by site category.

    :param brand_library_id: brand library to query.
    :param user_id: owner of the brand library.
    :param begin_time: first day (``YYYY-MM-DD``); ``00:00:00`` is appended.
    :param end_time: last day (``YYYY-MM-DD``); ``23:59:59`` is appended.
    :param p: one platform code or a list of platform codes.
    :return: the ``data`` field of the response when ``code`` is 200,
        otherwise an empty list (also on request or JSON errors).
    """
    url = "https://openapi.aidso.com/openapi/ywt/statistics/site_category"
    # The API expects a list; accept a single code for convenience.
    platform_list = [p] if isinstance(p, str) else p
    payload = json.dumps({
        "brand_library_id": brand_library_id,
        "user_id": user_id,
        "begin_time": f"{begin_time} 00:00:00",
        "end_time": f"{end_time} 23:59:59",
        "platform_list": platform_list,
    })
    headers = {
        'Content-Type': 'application/json'
    }
    try:
        response = requests.post(
            url,
            headers=headers,
            data=payload,
            timeout=15
        )
        response.raise_for_status()
        result = response.json()
        if result.get("code") == 200:
            return result.get("data", [])
        return []
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return []
    except ValueError as e:
        # Older requests versions raise a plain ValueError for a non-JSON
        # body; handle it the same way the sibling helpers do.
        print(f"返回结果不是合法 JSON: {e}")
        return []
def get_quote_mention_word_cloud(brand_library_id, user_id, begin_time, end_time):
    """Fetch the word-cloud data of quoted brand mentions.

    :param brand_library_id: brand library to query.
    :param user_id: owner of the brand library.
    :param begin_time: first day (``YYYY-MM-DD``); ``00:00:00`` is appended.
    :param end_time: last day (``YYYY-MM-DD``); ``23:59:59`` is appended.
    :return: the ``data`` object of the response when ``code`` is 200,
        otherwise an empty dict. Failures also return an empty dict so the
        result can always be read with ``.get``.
    """
    url = "https://openapi.aidso.com/openapi/ywt/quote_mention_word_cloud"
    payload = json.dumps({
        "brand_library_id": brand_library_id,
        "user_id": user_id,
        "begin_time": f"{begin_time} 00:00:00",
        "end_time": f"{end_time} 23:59:59"
    })
    headers = {
        'Content-Type': 'application/json'
    }
    try:
        response = requests.post(
            url,
            headers=headers,
            data=payload,
            timeout=15
        )
        response.raise_for_status()
        result = response.json()
        if result.get("code") == 200:
            return result.get("data") or {}
        return {}
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return {}
    except ValueError as e:
        # Older requests versions raise a plain ValueError for a non-JSON
        # body; handle it the same way the sibling helpers do.
        print(f"返回结果不是合法 JSON: {e}")
        return {}
def get_quto_statistics(brand_library_id, user_id, begin_time, end_time):
    """Fetch the overall quote statistics of a brand library.

    :param brand_library_id: brand library to query.
    :param user_id: owner of the brand library.
    :param begin_time: first day (``YYYY-MM-DD``); ``00:00:00`` is appended.
    :param end_time: last day (``YYYY-MM-DD``); ``23:59:59`` is appended.
    :return: the ``data`` object of the response when ``code`` is 200,
        otherwise an empty dict. Failures also return an empty dict so the
        result can always be read with ``.get``.
    """
    url = "https://openapi.aidso.com/openapi/ywt/statistics"
    payload = json.dumps({
        "brand_library_id": brand_library_id,
        "user_id": user_id,
        "begin_time": f"{begin_time} 00:00:00",
        "end_time": f"{end_time} 23:59:59"
    })
    headers = {
        'Content-Type': 'application/json'
    }
    try:
        response = requests.post(
            url,
            headers=headers,
            data=payload,
            timeout=15
        )
        response.raise_for_status()
        result = response.json()
        if result.get("code") == 200:
            return result.get("data") or {}
        return {}
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return {}
    except ValueError as e:
        # Older requests versions raise a plain ValueError for a non-JSON
        # body; handle it the same way the sibling helpers do.
        print(f"返回结果不是合法 JSON: {e}")
        return {}
def cha_report(phone_list, begin, end):
    """Export, per phone and per day, the raw data and the top-40 sources.

    For every phone in ``phone_list`` and every day between ``begin`` and
    ``end`` (inclusive, ``YYYY-MM-DD``) the function:

    1. fetches the request list of that day and exports it per brand;
    2. collects the distinct (user_id, brand_library_id, brand_name) triples;
    3. for each triple and each platform in a fixed list, writes the
       top-40 quoted sites into one Excel file.

    :param phone_list: iterable of account phone numbers.
    :param begin: first day of the range.
    :param end: last day of the range.
    """
    platforms = ["DB", "DOUBA", "TYQW", "TXYB"]
    fields = ["site_name", "log_count", "quote_ratio_percent"]
    for ph in phone_list:
        for de in generate_date_list(begin, end):
            data_list = get_req_id(ph, de, de)
            out_date = de.replace('-', '')
            # Reuse the list fetched above instead of requesting it a second
            # time through export_base_info.
            export_all_brand_excel(data_list, ph, de)
            print(f"ph:{ph}---de{de}开始执行")

            # Distinct brands of the day, in first-seen order.
            brands = []
            seen = set()
            for i in data_list or []:
                brand = {
                    "user_id": i.get("user_id"),
                    "brand_library_id": i.get("brand_library_id"),
                    "brand_name": i.get("brand_name")
                }
                key = (brand["user_id"], brand["brand_library_id"], brand["brand_name"])
                if key not in seen:
                    seen.add(key)
                    brands.append(brand)

            for r in brands:
                for p in platforms:
                    t_data = get_url_top40(r.get('user_id'), r.get('brand_library_id'), de, de, p)
                    if t_data is None:
                        # The API returned no "data" field; nothing to write.
                        print(f"未获取到引用来源数据: brand={r.get('brand_name')}, platform={p}")
                        continue
                    rows = [{k: item.get(k) for k in fields} for item in t_data]
                    out_path = f"/Users/yaowentong/Desktop/source_{out_date}/{ph}/{out_date}{p}_source/{r.get('brand_name')}_{out_date}_source"
                    writer = ExcelWriter(out_path)
                    try:
                        writer.write_batch_rows(rows)
                    finally:
                        # Release the workbook even when writing fails.
                        writer.close()
def qian_report(phone, begin, end, brand_name,platform=None):
    """Dump answers and quotes of one brand into two text files.

    The request list of ``phone`` is filtered by ``brand_name``; the matching
    request ids are looked up in ``geo_commit_task`` and, for every task,
    the answer (``context.txt``) and the quote list (``quote.txt``) are read
    from TOS. Two files are written on the desktop:

    * ``{brand_name}_all.txt``  - full records;
    * ``{brand_name}_lite.txt`` - same records with the bulky quote fields
      (``url``, ``site_icon``, ``task_id``, ``quto_id``) removed.

    Records are separated by two blank lines. Both files are written even
    when no record matches.

    :param phone: account phone number.
    :param begin: first day (``YYYY-MM-DD``).
    :param end: last day (``YYYY-MM-DD``).
    :param brand_name: only requests of this brand are exported.
    :param platform: optional iterable of platform codes to restrict to.
    """
    req_list = get_req_id(phone, begin, end) or []
    # req_id -> created_at for the requested brand (dict keys keep order).
    req_time_map = {}
    for item in req_list:
        if item.get('brand_name') == brand_name:
            req_time_map[item.get("req_id")] = item.get("created_at")

    query_list = []
    if req_time_map:
        # An empty id list would produce the invalid SQL "in ()", so the
        # query is only issued when there is something to look up.
        # NOTE(review): values are interpolated into the SQL text; prefer
        # bound parameters if bh_utils.query_data supports them.
        req_id_sql = ",".join(f"'{req_id}'" for req_id in req_time_map)
        query_sql = f"select * from geo_commit_task where reqId in ({req_id_sql})"
        if platform:
            platform_list = ",".join(f"'{p}'" for p in platform)
            query_sql += f" and platform in ({platform_list}) "
        query_list = bh_utils.query_data(query_sql) or []

    result = []
    for q in query_list:
        platform_code = q.get('platform')
        tos_prefix = f"geo/{q.get('taskId')}/{platform_code}"
        result.append({
            "prompt": q.get('prompt'),
            # Unknown platform codes are kept as-is instead of raising.
            "platform": plat_form_map.get(platform_code, platform_code),
            "content": tos_utils.get_string_from_tos(f"{tos_prefix}/context.txt"),
            "quote": tos_utils.get_string_from_tos(f"{tos_prefix}/quote.txt"),
            "created_at": req_time_map.get(q.get('reqId')),
        })

    all_file = f"/Users/yaowentong/Desktop/{brand_name}_all.txt"
    lite_file = f"/Users/yaowentong/Desktop/{brand_name}_lite.txt"
    with open(all_file, "w", encoding="utf-8") as f:
        for item in result:
            f.write(json.dumps(item, ensure_ascii=False) + "\n\n\n")

    # Build the lite records in memory; re-reading the file just written
    # would yield exactly the same dicts.
    remove_keys = ("url", "site_icon", "task_id", "quto_id")
    data = []
    for item in result:
        lite = dict(item)
        quote = lite.get("quote")
        if quote:
            try:
                quote_list = json.loads(quote)  # quote is stored as a JSON string
            except ValueError:
                # Malformed quote text: keep it untouched rather than abort.
                quote_list = None
            if quote_list is not None:
                if isinstance(quote_list, list):
                    for one in quote_list:
                        if isinstance(one, dict):
                            for k in remove_keys:
                                one.pop(k, None)
                lite["quote"] = json.dumps(quote_list, ensure_ascii=False)
        data.append(lite)

    with open(lite_file, "w", encoding="utf-8") as f:
        for item in data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n\n\n")
def zhou_report(phone, begin, end, brand_name):
    """Print the figures of the weekly report for one brand.

    The brand is located in the request list of ``phone``; its
    ``user_id`` / ``brand_library_id`` (taken from the last matching request)
    are then used to query the overview, brand-word ranking, sentiment,
    top-40 sources, site categories, word cloud and quote statistics. Every
    section is printed; nothing is returned.

    Missing or failed API sections are treated as empty instead of raising.

    :param phone: account phone number.
    :param begin: first day (``YYYY-MM-DD``).
    :param end: last day (``YYYY-MM-DD``).
    :param brand_name: brand to report on.
    """

    def _ratio(part, total):
        # Share of `part` in `total`, rounded to 2 decimals; 0.0 when the
        # total is missing or zero (avoids ZeroDivisionError / TypeError).
        if not total:
            return 0.0
        return round((part or 0) / total, 2)

    req_list = get_req_id(phone, begin, end) or []
    req_ids = []
    brand_id = 0
    user_id = 0
    for item in req_list:
        if item.get('brand_name') == brand_name:
            user_id = item.get("user_id")
            brand_id = item.get("brand_library_id")
            req_ids.append(item.get("req_id"))
    print(req_ids)

    # --- overview and per-platform figures ---
    platform_data = get_platforms(brand_id, user_id, begin, end) or {}
    geo_refer_rank_overview_vos = platform_data.get('geo_refer_rank_overview_vos') or []
    brand_overview = platform_data.get('brand_overview') or {}
    result = [
        {
            "平台": plat_form_map.get(item.get("platform"), item.get("platform")),
            "对话次数": item.get("ai_count"),
            "提及率": item.get("refer_rate"),
            "Top1提及率": item.get("top1_refer_rate"),
            "Top3提及率": item.get("top3_refer_rate"),
            "Top10提及率": item.get("top10_refer_rate"),
            "平均提及排名": item.get("avg_rank"),
            "提及好感度": item.get("sentiment_score"),
        }
        for item in geo_refer_rank_overview_vos
    ]
    brand_overview_result = [{
        '对话次数': brand_overview.get('ai_count'),
        '提及对话次数': brand_overview.get('valid_rank_count'),
        '提及率': brand_overview.get('refer_rate'),
        'top1提及率': brand_overview.get('top1_refer_rate'),
        'top3提及率': brand_overview.get('top3_refer_rate'),
        'top10提及率': brand_overview.get('top10_refer_rate'),
        '品牌提及次数': brand_overview.get('mention_count'),
        '平均提及排名': brand_overview.get('avg_rank'),
        '品牌提及好感度': brand_overview.get('sentiment_score'),
        '品牌得分': brand_overview.get('brand_score')
    }]
    # Overview figures
    print(f"概览数据:{brand_overview_result}")
    # Per-platform figures
    print(f"分平台数据:{result}")

    # --- ranking of all brand words ---
    all_word = get_all_word_rank(brand_id, user_id, begin, end) or []
    all_word_result = [
        {
            "品牌提及次数": word.get('ref_count'),
            "品牌名称": word.get("word"),
            "品牌得分": word.get("score"),
            "品牌提及率": word.get("rate"),
            "平均提及排名": word.get("avg_rank")
        }
        for word in all_word
    ]
    print(f"全部品牌词数据{all_word_result}")

    # --- sentiment overview ---
    sentiment_overview = get_sentiment_overview(brand_id, user_id, begin, end) or {}
    overview_result = [{
        "提及品牌AI对话次数": sentiment_overview.get('sentiment_count'),
        "品牌提及好感度": sentiment_overview.get('sentiment_score'),
        "正面情感": sentiment_overview.get('positive_count'),
        "中性情感": sentiment_overview.get('neutral_count'),
        "负面情感": sentiment_overview.get('negative_count'),
        "全部引用文章": sentiment_overview.get('article_count'),
        "有品牌内容引用文章": sentiment_overview.get('has_brand_count'),
        # TODO: negative quoted articles - currently reuses negative_count
        "负面引用文章": sentiment_overview.get('negative_count')
    }]
    print(f"overview_result{overview_result}")

    # --- per-platform sentiment shares ---
    platform_sentiment = get_platforms_sentiment(brand_id, user_id, begin, end) or []
    platform_sentiment_result = [
        {
            "平台": plat_form_map.get(pla.get("platform"), pla.get("platform")),
            "正面": _ratio(pla.get('positive_count'), pla.get('sentiment_count')),
            "中性": _ratio(pla.get('neutral_count'), pla.get('sentiment_count')),
            "负面": _ratio(pla.get('negative_count'), pla.get('sentiment_count')),
            "提及品牌次数": pla.get('brand_words_count'),
            "引用来源": pla.get('source_count'),
        }
        for pla in platform_sentiment
    ]
    print(platform_sentiment_result)

    # --- top-40 quoted sites over all platforms ---
    platform_list = ["DB", "DP", "TYQW", "TXYB", "BDAI", "DOUBA", "DPA", "DYAI", "KIMI", "WXYY"]
    url_top_40 = get_url_top40(user_id, brand_id, begin, end, platform_list) or []
    url_top_40_result = [
        {
            "网站名称": url.get('site_name'),
            "引用次数": url.get('log_count'),
            "引用占比": url.get('quote_ratio_percent'),
        }
        for url in url_top_40
    ]
    print(url_top_40_result)

    # --- quotes grouped by site category ---
    site_category = get_site_category(brand_id, user_id, begin, end, platform_list) or []
    site_category_result = [
        {
            "站点类型": site.get('category'),
            "站点数量": site.get('site_count'),
            "引用文章数": site.get('resource_count'),
            "引用次数": site.get('total_log_count'),
            "引用率": site.get('quote_rate'),
        }
        for site in site_category
    ]
    print(site_category_result)

    # --- positive / negative keyword clouds ---
    quote_mention_word_cloud = get_quote_mention_word_cloud(brand_id, user_id, begin, end) or {}
    positive_top = quote_mention_word_cloud.get('positive_top') or []
    negative_top = quote_mention_word_cloud.get('negative_top') or []
    positive_top_result = [
        {"关键词": positive.get('word'), "出现次数": positive.get('count')}
        for positive in positive_top
    ]
    negative_top_result = [
        {"关键词": negative.get('word'), "出现次数": negative.get('count')}
        for negative in negative_top
    ]
    print(positive_top_result)
    print(negative_top_result)

    # --- overall quote statistics ---
    quto_statistics = get_quto_statistics(brand_id, user_id, begin, end) or {}
    quto_statistics_result = [{
        "引用文章数": quto_statistics.get('resource_count'),
        "引用次数": quto_statistics.get('total_log_count'),
        "引用网站": quto_statistics.get('site_count'),
    }]
    print(quto_statistics_result)
def cha_report_video(videoId):
    """Fetch the detail record of one video from the internal service.

    :param videoId: id of the video.
    :return: the first element of the response ``data`` list, or ``None``
        when the body is not JSON or has no such element.
    :raises requests.RequestException: on network failure or timeout.
    """
    url = "http://172.16.1.37:8874/getVideoDetail2"
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, params={"videoId": videoId}, timeout=30)
    try:
        return response.json().get('data')[0]
    except (ValueError, AttributeError, TypeError, LookupError):
        # Not JSON, not an object, "data" missing/null, or "data" empty.
        return None
def cha_report_user(uid):
    """Fetch the profile of one user from the internal service.

    :param uid: id of the user.
    :return: the ``data`` field of the JSON response (``None`` when absent).
    :raises requests.RequestException: on network failure or timeout.
    """
    url = "http://172.16.1.37:8873/getUserInfoUid"
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, params={"uid": uid}, timeout=30)
    return response.json().get('data')
def get_detail(keyword,monday,sunday):
    """Fetch the related-word data of ``keyword`` for one week.

    :param keyword: search keyword.
    :param monday: first day of the week (``YYYYMMDD``).
    :param sunday: last day of the week (``YYYYMMDD``).
    :return: the ``data`` field when the response ``code`` is 200, otherwise
        ``None`` (also when the body cannot be parsed).
    :raises requests.RequestException: on network failure or timeout.
    """
    url = "http://172.16.1.37:8873/SuanShuGetRelationWord"
    # Passing the values as params escapes keywords containing '&', '#', ...
    params = {"keyword": keyword, "monday": monday, "sunday": sunday}
    response = requests.get(url, params=params, timeout=30)
    try:
        response_json = response.json()
        if response_json.get('code') == 200:
            return response_json.get('data')
    except Exception as e:
        print(e)
    return None
def get_portrait(keyword,monday,sunday):
    """Fetch the audience-portrait data of ``keyword`` for a date range.

    :param keyword: search keyword.
    :param monday: first day of the range (``YYYYMMDD``).
    :param sunday: last day of the range (``YYYYMMDD``).
    :return: the ``data`` field when the response ``code`` is 200, otherwise
        ``None`` (also when the body cannot be parsed).
    :raises requests.RequestException: on network failure or timeout.
    """
    url = "http://172.16.1.37:8873/SuanShuGetPortrait"
    # Passing the values as params escapes keywords containing '&', '#', ...
    params = {"keyword": keyword, "monday": monday, "sunday": sunday}
    response = requests.get(url, params=params, timeout=30)
    try:
        response_json = response.json()
        if response_json.get('code') == 200:
            return response_json.get('data')
    except Exception as e:
        print(e)
    return None
def get_full_weeks(start_time: str, end_time: str):
    """List the Monday-to-Sunday weeks fully contained in a date range.

    :param start_time: first day of the range, ``YYYYMMDD``.
    :param end_time: last day of the range, ``YYYYMMDD``.
    :return: list of ``(monday, sunday)`` string pairs in ``YYYYMMDD`` form,
        in chronological order; empty when no whole week fits or when
        ``start_time`` is later than ``end_time``.
    :raises ValueError: if either argument does not match ``YYYYMMDD``.
    """
    day_format = "%Y%m%d"
    range_start = datetime.strptime(start_time, day_format).date()
    range_end = datetime.strptime(end_time, day_format).date()
    if range_start > range_end:
        return []

    # First Monday on or after the start of the range.
    monday = range_start + timedelta(days=(-range_start.weekday()) % 7)
    sunday = monday + timedelta(days=6)

    weeks = []
    # A week counts only if its Sunday is still inside the range.
    while sunday <= range_end:
        weeks.append((monday.strftime(day_format), sunday.strftime(day_format)))
        monday += timedelta(days=7)
        sunday = monday + timedelta(days=6)
    return weeks
def suanshu_export(start,end,keyword_list,file_name):
    """Export weekly detail and overall portrait data, one file per keyword.

    For every keyword, one "detail" record is collected per full week inside
    ``start``..``end`` and one "portrait" record for the whole range. The
    records are written as JSON, separated by two blank lines, to
    ``{file_name}/{keyword}.txt``.

    :param start: first day of the range, ``YYYYMMDD``.
    :param end: last day of the range, ``YYYYMMDD``.
    :param keyword_list: keywords to export.
    :param file_name: directory the per-keyword files are written to.
    """
    weeks = get_full_weeks(start, end)
    for keyword in keyword_list:
        records = []
        for monday, sunday in weeks:
            weekly = get_detail(keyword, monday, sunday)
            records.append({
                "type": "detail",
                "keyword": keyword,
                "start": monday,
                "end": sunday,
                "result": weekly
            })
        # The portrait covers the whole requested range, not a single week.
        records.append({
            "type": "portrait",
            "keyword": keyword,
            "start": start,
            "end": end,
            "result": get_portrait(keyword, start, end)
        })
        with open(f"{file_name}/{keyword}.txt", "w", encoding="utf-8") as out:
            for record in records:
                out.write(json.dumps(record, ensure_ascii=False) + "\n\n\n")
def dao_report(phone, begin, end, brand_name,platform=None):
    """Dump prompts and search keywords of one brand into a text file.

    The request list of ``phone`` is filtered by ``brand_name``; the matching
    request ids are looked up in ``geo_commit_task`` and, for every task,
    ``search_keyword.txt`` is read from TOS. The records are written as
    JSON, separated by two blank lines, to
    ``{brand_name}_dao_report.txt`` on the desktop. The file is written even
    when no record matches.

    :param phone: account phone number.
    :param begin: first day (``YYYY-MM-DD``).
    :param end: last day (``YYYY-MM-DD``).
    :param brand_name: only requests of this brand are exported.
    :param platform: optional iterable of platform codes to restrict to.
    """
    req_list = get_req_id(phone, begin, end) or []
    # req_id -> created_at for the requested brand (dict keys keep order).
    req_time_map = {}
    for item in req_list:
        if item.get('brand_name') == brand_name:
            req_time_map[item.get("req_id")] = item.get("created_at")

    query_list = []
    if req_time_map:
        # An empty id list would produce the invalid SQL "in ()", so the
        # query is only issued when there is something to look up.
        # NOTE(review): values are interpolated into the SQL text; prefer
        # bound parameters if bh_utils.query_data supports them.
        req_id_sql = ",".join(f"'{req_id}'" for req_id in req_time_map)
        query_sql = f"select * from geo_commit_task where reqId in ({req_id_sql})"
        if platform:
            platform_list = ",".join(f"'{p}'" for p in platform)
            query_sql += f" and platform in ({platform_list}) "
        query_list = bh_utils.query_data(query_sql) or []

    result = []
    for q in query_list:
        platform_code = q.get('platform')
        result.append({
            "prompt": q.get('prompt'),
            # Unknown platform codes are kept as-is instead of raising.
            "platform": plat_form_map.get(platform_code, platform_code),
            "search_keyword": tos_utils.get_string_from_tos(
                f"geo/{q.get('taskId')}/{platform_code}/search_keyword.txt"),
            "created_at": req_time_map.get(q.get('reqId')),
        })

    all_file = f"/Users/yaowentong/Desktop/{brand_name}_dao_report.txt"
    with open(all_file, "w", encoding="utf-8") as f:
        for item in result:
            f.write(json.dumps(item, ensure_ascii=False) + "\n\n\n")
# =========================
# Usage example
# =========================
# Script entry point: runs dao_report for one hard-coded account, date range,
# brand and platform list.
if __name__ == "__main__":
# print(cha_report_user(3738753854605260))
# keyword_list = ['618']
# start = '20250518'
# end = '20250622'
# file_name = '/Users/yaowentong/Desktop/'
# Requirement notes for the run below:
# account 13810898434
# Doubao mobile version
# the brand "AIDSO爱搜"
# last 7 days
# all questions
phone = 13810898434
begin = '2026-05-13'
end = '2026-05-20'
brand_name = 'AIDSO爱搜'
platform = ['DB']
dao_report(phone, begin, end, brand_name, platform)
# NOTE(review): the commented call below passes suanshu_export-style
# arguments to qian_report; fix the arguments before re-enabling it.
# qian_report(start, end, keyword_list, file_name)
\ No newline at end of file
# -*- coding: utf-8 -*-
import secrets
import string
import time
import uuid
import threading
import redis
import openpyxl
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import os
from datetime import datetime, timedelta
from openpyxl.workbook import Workbook
import os ,sys
from loguru import logger
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(BASE_DIR)
from aidso_geo.models import spider_save_tos
from aidso_geo.utils import bh_utils, tos_utils
# =========================
# 配置区
# =========================
import requests
import json
def get_token():
    """Request a Feishu tenant access token for the internal app.

    The app credentials can be supplied through the ``FEISHU_APP_ID`` and
    ``FEISHU_APP_SECRET`` environment variables; the built-in values are
    used when they are not set.

    :return: the ``tenant_access_token`` field of the response, or ``None``
        when the response does not contain one.
    :raises requests.RequestException: on network failure or timeout.
    """
    url = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal"
    # NOTE(review): the fallback credentials are committed in source control;
    # rotate them and rely on the environment variables instead.
    payload = json.dumps({
        "app_id": os.environ.get("FEISHU_APP_ID", "cli_a903ede61ff99bcc"),
        "app_secret": os.environ.get("FEISHU_APP_SECRET", "yiJkjDnHcbYhIWD0Hb26OfuxUXAu13ra")
    })
    headers = {
        'Content-Type': 'application/json'
    }
    # Every Feishu call starts here, so a missing timeout could stall all of them.
    response = requests.post(url, headers=headers, data=payload, timeout=30)
    return response.json().get('tenant_access_token')
# Feishu chat id: used as the receiver of messages/files and as the chat
# whose announcement blocks are read by get_task.
container_id = "oc_41bc8e496a0d9bb93498a6a6cf6f30c1"
# Generate the next day's tasks at 23:00 every day
SCHEDULE_HOUR = 23
SCHEDULE_MINUTE = 0
SCHEDULE_SECOND = 0
# Polling interval (seconds) while waiting for the tasks to finish
# (i.e. for the Redis set mt_third_task to become empty)
POLL_SECONDS = 180
# After 00:00 of the next day, wait a little before starting, to avoid
# jitter around the day boundary
AFTER_MIDNIGHT_DELAY_SECONDS = 60
# Directory the Excel files are written to
# EXCEL_DIR = "/opt/file"
EXCEL_DIR = "/Users/yaowentong/Desktop"
# Expiry (seconds) of the Redis de-duplication locks
INIT_TASK_LOCK_EXPIRE = 3 * 24 * 3600 # 3 days: keeps lock keys from piling up
RUNNER_LOCK_EXPIRE = 3 * 24 * 3600 # 3 days: keeps lock keys from piling up
# =========================
# 飞书上传/发消息
# =========================
def upload_file(file_path, file_name):
    """Upload an xlsx file to Feishu and return its file key.

    :param file_path: local path of the file to upload.
    :param file_name: name shown in Feishu.
    :return: the ``file_key`` from the response, or ``None`` when the file is
        missing, the request fails, or the response carries no key.
    """
    url = "https://open.feishu.cn/open-apis/im/v1/files"
    headers = {"Authorization": f"Bearer {get_token()}"}
    data = {
        "file_type": "xlsx",
        "file_name": file_name
    }
    try:
        # Opening inside the try block lets a missing file reach the
        # FileNotFoundError handler; `with` guarantees the handle is closed.
        with open(file_path, "rb") as fh:
            files = {
                "file": (
                    file_name,
                    fh,
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )
            }
            resp = requests.post(url=url, headers=headers, files=files, data=data, timeout=30)
        rj = resp.json()
        return (rj.get("data") or {}).get("file_key")
    except FileNotFoundError as e:
        logger.error(f"错误:文件不存在 - {e}")
    except Exception as e:
        logger.error(f"请求失败:{e}")
    return None
def chat_message(file_key):
    """Send an already uploaded file to the Feishu chat ``container_id``.

    :param file_key: key returned by the Feishu file upload.
    :return: None; the raw response text is logged.
    """
    endpoint = "https://open.feishu.cn/open-apis/im/v1/messages?receive_id_type=chat_id"
    message = {
        "content": f'{{"file_key":"{file_key}"}}',
        "msg_type": "file",
        "receive_id": container_id,
        # Random uuid sent with every message.
        "uuid": str(uuid.uuid4())
    }
    body = json.dumps(message)
    auth_headers = {
        "Authorization": f"Bearer {get_token()}",
        "Content-Type": "application/json"
    }
    reply = requests.post(endpoint, headers=auth_headers, data=body, timeout=30)
    logger.success(reply.text)
def send_message(message):
    """Send a plain-text message to the Feishu chat ``container_id``.

    :param message: text to send.
    :return: None; the raw response text is logged.
    :raises requests.RequestException: on network failure or timeout.
    """
    url = "https://open.feishu.cn/open-apis/im/v1/messages?receive_id_type=chat_id"
    payload = {
        "receive_id": container_id,
        "msg_type": "text",
        # Feishu expects the content itself to be a JSON-encoded string.
        "content": json.dumps({
            "text": message
        }, ensure_ascii=False)
    }
    headers = {
        'Authorization': f'Bearer {get_token()}',
        'Content-Type': 'application/json'
    }
    # Same 30 s timeout as chat_message, so a stalled call cannot hang the job.
    response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=30)
    logger.success(response.text)
def get_task():
    """Read all announcement blocks of the Feishu chat ``container_id``.

    Pages of up to 500 blocks are fetched until the API reports no more
    data. On any error the blocks collected so far are returned.

    :return: list of block dicts (possibly empty).
    """
    url = f"https://open.feishu.cn/open-apis/docx/v1/chats/{container_id}/announcement/blocks?page_size=500&revision_id=-1"
    all_items = []
    page_token = None
    while True:
        headers = {
            "Authorization": f"Bearer {get_token()}"
        }
        params = {"page_token": page_token} if page_token else {}
        try:
            resp = requests.get(url, headers=headers, params=params, timeout=30)
            resp.raise_for_status()
            page_data = resp.json().get("data") or {}
            all_items.extend(page_data.get("items") or [])
            page_token = page_data.get("page_token")
            # Stop on the last page. Also stop when more data is announced
            # without a token: the next request would fetch the first page
            # again and loop forever.
            if not page_data.get("has_more", False) or not page_token:
                break
        except Exception as e:
            logger.error(f"请求失败:{e}")
            break
    return all_items
# =========================
# Redis
# =========================
def init_redis4():
    """Create the Redis client for database 4.

    NOTE(review): ``redis.Redis(...)`` does not connect until the first
    command, so an unreachable server is not detected here; callers must
    still expect connection errors on use.

    :return: a ``redis.Redis`` client with ``decode_responses=True``, or
        ``None`` if the client could not be constructed.
    """
    try:
        # NOTE(review): host and password are hard-coded; consider moving
        # them to configuration.
        return redis.Redis(
            host="172.16.0.24",
            port=6379,
            db=4,
            password="aiyingli@@123",
            socket_timeout=5,
            decode_responses=True
        )
    except Exception as e:
        # Logged at error level so the failure reaches the error sinks.
        logger.error(f"redis 初始化失败: {e}")
        return None
def wait_mt_tasks_done(redis_client):
    """Block until the Redis set ``mt_third_task`` is empty.

    The set size is polled every ``POLL_SECONDS`` seconds. An empty set is
    taken to mean that all collection tasks have finished. Read errors are
    logged and treated as "not finished", so polling continues.

    :param redis_client: client exposing ``scard``.
    """
    while True:
        try:
            size = redis_client.scard("mt_third_task")
            logger.success(f"size:{size}")
        except Exception as e:
            # Logged at error level so the failure reaches the error sinks.
            logger.error(f"redis 读取 mt_third_task 失败: {e}")
            size = -1  # unknown size: keep waiting
        if size == 0:
            return
        time.sleep(POLL_SECONDS)
# =========================
# 任务初始化:每天 23:00 抓公告,写第二天 pt 的任务
# =========================
def init_task(pt_for_tomorrow: str):
    """Create the task rows for the given partition date.

    Every non-empty text line of the Feishu chat announcement becomes a
    prompt. Each prompt is scheduled three times (``cn`` = 0, 1, 2) on
    platform ``DB`` and the rows are inserted into ``geo_third_task_log``.
    Nothing is written when the announcement yields no prompt.

    :param pt_for_tomorrow: partition date stored in the ``pt`` column.
    """
    prompts = []
    for block in get_task():
        try:
            text_node = block.get("text")
            if not text_node:
                continue
            elements = text_node.get("elements") or []
            if not elements:
                continue
            # Only the first text run of a block is used as the prompt.
            first_run = (elements[0] or {}).get("text_run") or {}
            line = (first_run.get("content") or "").strip()
            if line:
                prompts.append(line)
        except Exception:
            # Malformed blocks are skipped.
            continue

    if not prompts:
        logger.success(f"[init_task] 公告里没有取到 prompt,pt={pt_for_tomorrow}")
        return

    rows = [
        {
            "reqId": str(uuid.uuid4()),
            "prompt": prompt,
            "platform": "DB",
            "cn": cn,
            "pt": pt_for_tomorrow,
            "rank": position
        }
        for cn in range(3)
        for position, prompt in enumerate(prompts)
    ]
    bh_utils.insert_data("geo_third_task_log", rows)
    logger.success(f"[init_task] 已写入任务: pt={pt_for_tomorrow}, prompts={len(prompts)}, rows={len(rows)}")
def init_task_once(pt_for_tomorrow: str):
    """Run ``init_task`` at most once per partition date.

    A Redis key per ``pt`` acts as the marker, so repeated scheduler triggers
    or restarts do not insert the tasks twice.

    :param pt_for_tomorrow: partition date the tasks are generated for.
    """
    client = init_redis4()
    if not client:
        # Without Redis there is no de-duplication: run anyway (rows may be
        # written twice, but the flow is not blocked).
        init_task(pt_for_tomorrow)
        return

    # SET NX succeeds only for the first caller of this pt.
    acquired = client.set(
        f"mt:init_task_done:{pt_for_tomorrow}",
        "1",
        nx=True,
        ex=INIT_TASK_LOCK_EXPIRE,
    )
    if not acquired:
        logger.success(f"[init_task_once] 已生成过 pt={pt_for_tomorrow},跳过")
        return
    init_task(pt_for_tomorrow)
# =========================
# 解析豆包 SSE 原始数据(保留)
# =========================
def doubao_process_original_data(file_path, original_content):
# Parse the raw SSE text of a Doubao conversation.
#
# original_content is split on "\n"; only lines starting with "data:" whose
# payload is valid JSON are considered. From these events the function
# collects the search keywords, the referenced results (url_list), the
# "thinking" text, the final answer text and the follow-up suggestions,
# saves them through spider_save_tos.process_and_save_files(file_path, ...)
# and returns the tuple
# (file_path, search_keyword, url_list, think_content, response_content,
# suggestions).
#
# NOTE(review): rich_media_block is filled below but is neither saved nor
# returned.
# NOTE(review): the indentation of this block was lost in this view; remarks
# about which branch a statement belongs to are best guesses - confirm
# against the original file.
url_list = ""
think_content = ""
response_content = ""
search_keyword = []
suggestions = []
# is_think / think_bool: "currently inside the thinking section" flags for
# the two event formats handled below.
is_think = False
rich_media_block = []
think_bool = False
response_bool = False
content_list = original_content.split("\n")
for i in content_list:
if i != "":
# Skip lines that are not SSE data lines.
if not i.startswith("data:"):
continue
payload = i[len("data:"):].lstrip()
if not payload:
continue
try:
json_content = json.loads(payload)
except (IndexError, json.JSONDecodeError):
continue
# --- format 1: events with event_type 2001; event_data is itself a JSON string ---
if json_content.get('event_type') == 2001:
even_data = json.loads(json_content.get('event_data'))
message_data = even_data.get('message')
# A tts_content value replaces the answer collected so far.
if even_data.get('tts_content') is not None:
response_content = even_data.get('tts_content')
# content_type 2007: video cards of a search result.
# NOTE: this inner loop reuses the name `i` of the outer loop.
if message_data.get('content_type') == 2007:
for i in json.loads(message_data.get('content')).get("search_result").get("video_card").get(
"card_list"):
rich_media_block.append(i)
# content_type 10040 switches the thinking flag: on while is_finish is
# absent, off once is_finish is True.
if message_data.get('content_type') == 10040 and message_data.get('is_finish') is None:
think_bool = True
continue
if message_data.get('content_type') == 10040 and message_data.get('is_finish') == True:
think_bool = False
continue
# While thinking, message text is appended to think_content.
if think_bool:
if json.loads(message_data.get('content')).get('text') is not None:
think_content += json.loads(message_data.get('content')).get('text')
content_json = json.loads(message_data.get('content'))
# content_type 10025 with results: append a bold summary line with the
# number of queries and of results.
if message_data.get('content_type') == 10025 and content_json.get('results') is not None:
think_content += "\n\n"
think_content += "**搜索"
think_content += str(len(content_json.get('queries')))
think_content += "个关键词,参考"
think_content += str(len(content_json.get('results')))
think_content += "篇文章**"
think_content += "\n\n"
# content_type 10025: accumulate the search queries; results replace
# url_list when scene == 2.
if message_data.get('content_type') == 10025:
content_json = json.loads(message_data.get('content'))
if content_json.get('queries') is not None and content_json.get('results') is not None:
search_keyword = search_keyword + json.loads(message_data.get('content')).get('queries')
if content_json.get('scene') == 2:
url_list = content_json.get('results')
# content_type 2002: follow-up suggestions.
if message_data.get('content_type') == 2002:
suggestions = suggestions + json.loads(message_data.get('content')).get('suggestions')
# --- format 2: presumably the branch for events that are not event_type 2001 ---
else:
# Only the first entry of patch_op is inspected.
if json_content.get("patch_op"):
# patch_object 111: incremental answer text.
if json_content.get('patch_op')[0].get("patch_object") == 111:
if json_content.get('patch_op')[0].get("patch_value").get("tts_content"):
response_content += json_content.get('patch_op')[0].get("patch_value").get("tts_content")
# patch_object 1: content blocks (search results, thinking text).
if json_content.get('patch_op')[0].get("patch_object") == 1:
# A search_query_result_block replaces both the keywords and url_list.
if json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
"content").get("search_query_result_block"):
search_keyword = json_content.get('patch_op')[0].get("patch_value").get("content_block")[
0].get("content").get("search_query_result_block").get("queries")
url_list = json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
"content").get("search_query_result_block").get("results")
# While thinking, the block summary is appended in bold.
if is_think == True and \
json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
"content").get("search_query_result_block").get("results") is not None:
think_content += "\n\n"
think_content += "**"
think_content += \
json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
"content").get("search_query_result_block").get("summary")
think_content += "**"
think_content += "\n\n"
# block_type 10000 with a parent_id and more than one patch_op entry
# turns the thinking flag on; its text goes to think_content.
if json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
"block_type") == 10000 and \
json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
"parent_id") and len(json_content.get('patch_op')) > 1:
is_think = True
if json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
"content").get('text_block').get("text"):
think_content += \
json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
"content").get("text_block").get("text")
continue
# block_type 10040 turns the thinking flag off.
if json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
"block_type") == 10040:
is_think = False
continue
# patch_object 50: suggestions stored as a JSON string under ext.sp_v2.
if json_content.get('patch_op')[0].get("patch_object") == 50:
for sug in json.loads(
json_content.get('patch_op')[0].get("patch_value").get("ext").get("sp_v2")):
suggestions.append(sug.get("content"))
# Plain "text" events extend the thinking text while is_think is set.
if is_think:
if json_content.get("text"):
think_content += json_content.get("text")
else:
# Otherwise the first content block is inspected: block_type 10050 holds
# rich media (videos), block_type 10000 replaces the answer text.
if json_content.get("content"):
content_block = json_content.get("content").get('content_block')
if content_block:
if len(content_block) > 0:
if content_block[0].get('block_type') == 10050:
for i in content_block[0].get('content').get('rich_media_block').get('creations'):
rich_media_block.append(i.get('video'))
if content_block[0].get('block_type') == 10000:
if content_block[0].get("content").get("text_block").get("text"):
response_content = content_block[0].get("content").get("text_block").get("text")
# De-duplicate the suggestions (going through a set does not keep their order).
suggestions = list(set(suggestions))
spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
suggestions)
return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
def timestamp_to_datetime(timestamp, fmt="%Y-%m-%d %H:%M:%S"):
    """Format a Unix timestamp (seconds) as local time.

    :param timestamp: seconds since the epoch; anything ``int()`` accepts
        (fractions are truncated).
    :param fmt: ``strftime`` format of the result.
    :return: the formatted local date-time string.
    """
    seconds = int(timestamp)
    return datetime.fromtimestamp(seconds).strftime(fmt)
# =========================
# ExcelWriter
# =========================
class ExcelWriter:
    """Append ten-column result rows to an Excel workbook.

    The workbook at ``file_path`` is opened when it exists and created
    otherwise; the first row always holds the column headers.
    """

    # Header row, in the column order used by write_batch_rows.
    HEADERS = ('问题', '平台', '回答', '思考过程', '引用来源',
               '分享链接', '截图', '查询时间', 'ocr结果', '排名')
    # Header of the older two-column layout; files carrying it are left
    # untouched when reopened.
    _LEGACY_HEADERS = ('taskID', 'response')

    def __init__(self, file_path):
        """Open or create the workbook.

        :param file_path: path of the Excel file (e.g. ``'./output.xlsx'``).
        """
        self.file_path = file_path
        self.workbook = None
        self.worksheet = None
        self._init_workbook()

    def _first_row_matches(self, headers):
        """Return True when row 1 starts with exactly ``headers``."""
        return all(
            self.worksheet.cell(row=1, column=col).value == title
            for col, title in enumerate(headers, start=1)
        )

    def _write_header(self):
        """Write ``HEADERS`` into row 1."""
        for col, title in enumerate(self.HEADERS, start=1):
            self.worksheet.cell(row=1, column=col, value=title)

    def _init_workbook(self):
        """Load the existing workbook or create a new one, ensuring a header row."""
        if os.path.exists(self.file_path):
            self.workbook = openpyxl.load_workbook(self.file_path)
            # Work on the active (first) sheet.
            self.worksheet = self.workbook.active
            # Recognise the header this class writes itself; checking only
            # for the legacy header made every reopened file gain a stray
            # 'taskID' / 'response' row.
            has_header = (self._first_row_matches(self.HEADERS)
                          or self._first_row_matches(self._LEGACY_HEADERS))
            if not has_header:
                self.worksheet.insert_rows(1)
                self._write_header()
        else:
            self.workbook = Workbook()
            self.worksheet = self.workbook.active
            self._write_header()

    def write_batch_rows(self, data_list):
        """Append rows below the last used row and save the file.

        :param data_list: non-empty list of 10-item sequences
            ``(prompt, platform, content, think, quote, share, snipaste,
            insertime, ocr, rank)``.
        :raises ValueError: if ``data_list`` is not a non-empty list, or a
            row does not have exactly ten items.
        """
        if not isinstance(data_list, list) or len(data_list) == 0:
            raise ValueError("data_list必须是非空的列表")
        # First empty row after the existing content.
        next_row = self.worksheet.max_row + 1
        for idx, (prompt, platform, content, think, quote, share, snipaste,
                  insertime, ocr, rank) in enumerate(data_list):
            values = (prompt, platform, content, think, quote, share,
                      snipaste, insertime, ocr, rank)
            for col, value in enumerate(values, start=1):
                self.worksheet.cell(row=next_row + idx, column=col, value=value)
        self.workbook.save(self.file_path)

    def close(self):
        """Close the workbook and release its resources."""
        if self.workbook:
            self.workbook.close()
# =========================
# 导出 Excel
# =========================
def get_today_date_string():
    """Return today's local date as a ``YYYYMMDD`` string."""
    # Imported locally under a private name so it cannot clash with other
    # datetime names used in this module.
    from datetime import date as _date
    return _date.today().strftime("%Y%m%d")
def rand_16():
    """Return a random 16-character string of digits and lowercase letters.

    Characters are drawn with ``secrets.choice``.
    """
    alphabet = string.digits + string.ascii_lowercase  # 0-9a-z
    picked = [secrets.choice(alphabet) for _ in range(16)]
    return ''.join(picked)
def get_context(image_url):
    """Extract the text of an image through the Ark ``responses`` API.

    :param image_url: publicly reachable URL of the image.
    :return: the first ``output_text`` found in the response, or ``""`` when
        there is none or the request fails (best effort).
    """
    url = "https://ark.cn-beijing.volces.com/api/v3/responses"
    payload = json.dumps({
        "model": "doubao-seed-2-0-lite-260428",
        "input": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_image",
                        "image_url": image_url
                    },
                    {
                        "type": "input_text",
                        "text": "提取图中文字"
                    }
                ]
            }
        ]
    })
    # NOTE(review): the API key is hard-coded; consider moving it to
    # configuration.
    headers = {
        'Authorization': 'Bearer fcc424e5-58af-494d-9683-5787413a26c9',
        'Content-Type': 'application/json'
    }
    try:
        # The caller runs this from many worker threads; a timeout keeps one
        # stalled request from blocking the whole export.
        response = requests.post(url, headers=headers, data=payload, timeout=60)
        data = response.json()
        return next(
            (
                content.get("text", "")
                for item in data.get("output", [])
                if item.get("type") == "message"
                for content in item.get("content", [])
                if content.get("type") == "output_text"
            ),
            ""
        )
    except Exception as e:
        # Still best effort, but leave a trace instead of failing silently.
        logger.warning(f"get_context failed for {image_url}: {e}")
        return ""
def to_excel2(pt, cn):
    """Export the results of one task round into an Excel file.

    The reqIds logged in ``geo_third_task_log`` for (``pt``, ``cn``) are
    looked up in ``geo_third_task``; for every task the answer, thinking
    text, quotes and share link are read from TOS, the screenshot is run
    through ``get_context`` (OCR), and the rows are written to
    ``{EXCEL_DIR}/{pt}美团{cn}.xlsx``. Rows whose assembly fails are logged
    and skipped.

    :param pt: partition date of the tasks.
    :param cn: round number of the tasks.
    :return: path of the written Excel file.
    :raises ValueError: from ``ExcelWriter.write_batch_rows`` when no row
        could be exported.
    """
    # NOTE(review): pt / cn / reqIds are interpolated into the SQL text;
    # prefer bound parameters if bh_utils.query_data supports them.
    query_list_log = bh_utils.query_data(
        f"SELECT DISTINCT reqId FROM geo_third_task_log where cn = {cn} and pt = {pt}"
    )
    req_ids = [row.get('reqId') for row in query_list_log or []]

    query_list = []
    if req_ids:
        # An empty id list would produce the invalid SQL "IN ()".
        uuid_str = ",".join(f"'{req_id}'" for req_id in req_ids)
        query_list = bh_utils.query_data(
            f"SELECT DISTINCT reqId,prompt, platform,insertime,rank FROM geo_third_task WHERE reqId IN ({uuid_str}) order by rank asc"
        ) or []

    task_list = [
        {
            "reqId": i.get("reqId"),
            "prompt": i.get("prompt"),
            "platform": i.get("platform"),
            "insertime": i.get("insertime"),
            "rank": i.get("rank")
        }
        for i in query_list
    ]

    # Platform code -> display name used in the sheet.
    plat_form_map = {
        "DP": "deepseek网页版",
        "DB": "豆包网页版",
        "TXYB": "腾讯元宝",
        "TYQW": "通义千问",
        "KIMI": "kimi",
        "WXYY": "文心一言",
        "BDAI": "百度ai",
        "DYAI": "抖音ai",
        "DOUBA": "豆包安卓版",
        "DPA": "deepseek安卓版",
    }

    def build_row(i):
        # Assemble one sheet row; the order matches ExcelWriter's columns.
        reqId = i.get("reqId")
        base = f"geo_snipaste/{pt}/doubao/{reqId}"
        png_url = f'https://tcdn.aidso.com/geo_snipaste/{pt}/doubao/{reqId}/png.png'
        text_json_str = tos_utils.get_string_from_tos(f"{base}/text.json") or "{}"
        return (
            i.get("prompt"),
            plat_form_map.get(i.get("platform")),
            tos_utils.get_string_from_tos(f"{base}/context.txt"),
            tos_utils.get_string_from_tos(f"{base}/think.txt"),
            tos_utils.get_string_from_tos(f"{base}/quote.txt"),
            json.loads(text_json_str).get('share_url', ''),
            png_url,
            timestamp_to_datetime(i.get("insertime")),
            get_context(png_url),
            i.get("rank")
        )

    # Rows are built concurrently (I/O bound) but stored by index so the
    # sheet keeps the rank order of task_list.
    batch_data = [None] * len(task_list)
    with ThreadPoolExecutor(max_workers=50) as executor:
        future_map = {
            executor.submit(build_row, i): index
            for index, i in enumerate(task_list)
        }
        for future in as_completed(future_map):
            index = future_map[future]
            try:
                batch_data[index] = future.result()
            except Exception as e:
                logger.exception(f"导出数据失败 index={index}, task={task_list[index]}, err={e}")
                batch_data[index] = None
    # Drop the rows that failed.
    batch_data = [row for row in batch_data if row is not None]
    logger.success(f"时间{pt}--{cn}---导出数据{len(batch_data)}条")

    out_path = os.path.join(EXCEL_DIR, f"{pt}美团{cn}.xlsx")
    excel_writer = ExcelWriter(out_path)
    try:
        excel_writer.write_batch_rows(batch_data)
    finally:
        # Release the workbook even when writing fails.
        excel_writer.close()
    return out_path
# batch_data = []
# for i in task_list:
# reqId = i.get("reqId")
# prompt = i.get("prompt")
# platform = i.get("platform")
# insertime = i.get("insertime")
# rank = i.get("rank")
# file = f"geo_snipaste/{pt}/doubao/{reqId}/text.json"
# context_path =f"geo_snipaste/{pt}/doubao/{reqId}/context.txt"
# quote_path = f"geo_snipaste/{pt}/doubao/{reqId}/quote.txt"
# think = f"geo_snipaste/{pt}/doubao/{reqId}/think.txt"
# batch_data.append(
# (
# prompt,
# plat_form_map.get(platform),
# tos_utils.get_string_from_tos(context_path),
# tos_utils.get_string_from_tos(think),
# tos_utils.get_string_from_tos(quote_path),
# json.loads(tos_utils.get_string_from_tos(file)).get('share_url', ''),
# f'https://tcdn.aidso.com/geo_snipaste/{pt}/doubao/{reqId}/png.png',
# timestamp_to_datetime(insertime),
# get_context(f'https://tcdn.aidso.com/geo_snipaste/{pt}/doubao/{reqId}/png.png'),
# rank
# )
# )
# logger.success(f"时间{pt}--{cn}---导出数据{len(batch_data)}条")
# out_path = os.path.join(EXCEL_DIR, f"{pt}美团{cn}.xlsx")
# excel_writer = ExcelWriter(out_path)
# excel_writer.write_batch_rows(batch_data)
# excel_writer.close()
# return out_path
# =========================
# 时间等待工具
# =========================
def wait_until_time(hour: int, minute: int, second: int = 0):
    """Sleep until the next local occurrence of ``hour:minute:second``.

    If that time of day has already been reached today (including right now),
    the wait targets the same time tomorrow.
    """
    started = datetime.now()
    target = started.replace(hour=hour, minute=minute, second=second, microsecond=0)
    if not target > started:
        target += timedelta(days=1)
    delay = (target - started).total_seconds()
    time.sleep(delay if delay > 0 else 0)
def wait_until_date(date_obj):
    """Poll once a minute until the local date is on or after ``date_obj``."""
    while datetime.now().date() < date_obj:
        time.sleep(60)
# =========================
# 处理 cn:必须等 mt_third_task 为空才发消息
# =========================
def process_cn_after_tasks_done(pt: str, cn: int):
    """Wait for a batch to drain, backfill missing results, then export and send.

    Flow:
      1. Wait until ``wait_mt_tasks_done`` reports the ``mt_third_task`` queue done.
      2. Up to three times, look for reqIds that are in ``geo_third_task_log``
         but have no row in ``geo_third_task``, push them back onto the
         ``mt_third_task`` set and wait again.
      3. Run ``doubao_process_original_data`` on every stored ``text.json``
         that has a ``content`` field.
      4. Export the batch with ``to_excel2``, upload the file and, when the
         upload returns a file key, send it with ``chat_message``.

    :param pt: Batch identifier, also the folder name in the TOS paths.
    :param cn: Run number of the batch.
    """
    redis_client = init_redis4()
    logger.success(f"[pt={pt} cn={cn}] 等待 mt_third_task 跑空...")
    wait_mt_tasks_done(redis_client)
    logger.success(f"[pt={pt} cn={cn}] mt_third_task 已为空,初步完成")

    for attempt in range(3):
        diff_result = bh_utils.query_data(
            f"select a1.reqId,a1.prompt,a1.`rank` from (select reqId,prompt,`rank` from geo_third_task_log where pt = {pt} and cn = {cn}) as a1 left join geo_third_task as a2 on a1.reqId = a2.reqId where a2.reqId is null")
        if not diff_result:
            logger.success(f"[pt={pt} cn={cn}] 第{attempt + 1}轮检查已无缺失")
            break
        for row in diff_result:
            row["thinking_enabled"] = "1"
        values = [json.dumps(row, ensure_ascii=False) for row in diff_result]
        redis_client.sadd("mt_third_task", *values)
        logger.success(f"[pt={pt} cn={cn}] 缺失数据开始补充")
        wait_mt_tasks_done(redis_client)
        logger.success(f"[pt={pt} cn={cn}] 缺失数据补充完成")
        # NOTE(review): the reviewed text had lost its indentation; this pause
        # is assumed to belong to each retry round (before the next diff
        # check) — confirm.
        time.sleep(120)

    logger.success(f"[pt={pt} cn={cn}] mt_third_task 已为空,开始汇总导出/发送")

    # 1) Parse the raw JSON stored for every logged request.
    result = bh_utils.query_data(f"select * from geo_third_task_log where cn = {cn} and pt = {pt}") or []
    for entry in result:
        req_id = entry.get("reqId")
        if not req_id:
            continue
        file = f"geo_snipaste/{pt}/doubao/{req_id}/text.json"
        try:
            raw = tos_utils.get_string_from_tos(file)
            if not raw:
                continue
            content = json.loads(raw).get("content")
            if content:
                doubao_process_original_data(file, content)
        except Exception as e:
            # A failure on one file must not stop the rest, but it is an error
            # and is logged as one (it used to be reported at SUCCESS level).
            logger.exception(f"[pt={pt} cn={cn}] 处理 {file} 失败: {e}")

    # 2) Export to Excel.
    out_path = to_excel2(pt, cn)

    # 3) Upload, then send the Feishu message.
    file_name = os.path.basename(out_path)
    file_key = upload_file(out_path, file_name)
    if file_key:
        chat_message(file_key)
        logger.success(f"[pt={pt} cn={cn}] 已发送飞书文件: {file_name}")
    else:
        logger.error(f"[pt={pt} cn={cn}] 上传失败,未发送飞书消息")
# =========================
# =========================
def scheduler_thread():
    """Create the next day's tasks once per day, forever.

    Each round waits for the configured ``SCHEDULE_HOUR``/``SCHEDULE_MINUTE``/
    ``SCHEDULE_SECOND`` time of day (the original note mentions 23:00) and then
    calls ``init_task_once`` with tomorrow's date as ``YYYYMMDD``. Running in
    its own loop keeps task creation from being blocked by a long runner pass.
    """
    while True:
        wait_until_time(SCHEDULE_HOUR, SCHEDULE_MINUTE, SCHEDULE_SECOND)
        tomorrow = datetime.now().date() + timedelta(days=1)
        pt_tomorrow = tomorrow.strftime("%Y%m%d")
        try:
            init_task_once(pt_tomorrow)
        except Exception as e:
            # Keep the scheduler alive, but report the failure at error level
            # with its traceback instead of at SUCCESS level.
            logger.exception(f"[scheduler] init_task_once 失败: {e}")
        # Avoid firing twice for the same slot (e.g. if the sleep was interrupted).
        time.sleep(2)
def runner_thread():
    """Run the cn 0 -> 1 -> 2 passes at startup and then once per day.

    - At startup each cn of today's ``pt`` is run once. A Redis key per
      ``pt`` + ``cn`` (set with ``nx=True``) makes repeated starts on the same
      day skip the cn values that already ran.
    - Afterwards the loop waits for the next calendar day plus
      ``AFTER_MIDNIGHT_DELAY_SECONDS`` and runs all three cn values again;
      the daily pass is not deduplicated.
    """
    r = init_redis4()

    def run_once_for_pt(pt_today: str):
        """Queue and process cn 0, 1, 2 of ``pt_today``; one failure does not stop the rest."""
        for cn in (0, 1, 2):
            try:
                rows = bh_utils.query_data(
                    f"select reqId,prompt,rank from geo_third_task_log where cn = {cn} and pt = {pt_today}"
                ) or []
                for row in rows:
                    row["thinking_enabled"] = "1"
                values = [json.dumps(row, ensure_ascii=False) for row in rows]
                if values:
                    r.sadd("mt_third_task", *values)
                # NOTE(review): indentation was lost in the reviewed text; this
                # call is assumed to run even when nothing was queued — confirm.
                process_cn_after_tasks_done(pt_today, cn)
            except Exception as e:
                logger.exception(f"[runner] pt={pt_today} cn={cn} 执行失败: {e}")

    # ========= 1) Run once right away (deduplicated per cn) =========
    try:
        pt_today = datetime.now().date().strftime("%Y%m%d")
        for cn in (0, 1, 2):
            # The lock key includes cn so every cn is tracked separately.
            lock_key = f"mt:runner_startup_ran:{pt_today}:{cn}"
            # Without a Redis client the pass runs anyway (it may repeat).
            ok = (r.set(lock_key, "1", nx=True, ex=RUNNER_LOCK_EXPIRE) if r else True)
            if ok:
                logger.success(f"[runner] startup run for pt={pt_today} cn={cn}")
                # Startup handles one cn at a time: the body of run_once_for_pt, unrolled.
                # NOTE(review): unlike run_once_for_pt, the rows queued here are
                # not given "thinking_enabled" — confirm whether that is intended.
                try:
                    rows = bh_utils.query_data(
                        f"select reqId,prompt,rank from geo_third_task_log where cn = {cn} and pt = {pt_today}"
                    ) or []
                    values = [json.dumps(row, ensure_ascii=False) for row in rows]
                    if values:
                        r.sadd("mt_third_task", *values)
                    process_cn_after_tasks_done(pt_today, cn)
                except Exception as e:
                    logger.exception(f"[runner] startup pt={pt_today} cn={cn} 执行失败: {e}")
            else:
                logger.success(f"[runner] startup already done for pt={pt_today} cn={cn}, skip")
        logger.success(f"[runner] pt={pt_today} all finish")
    except Exception as e:
        logger.exception(f"[runner] 启动跑一次失败: {e}")

    # ========= 2) Then run once per new day (not deduplicated) =========
    while True:
        today_date = datetime.now().date()
        tomorrow_date = today_date + timedelta(days=1)
        wait_until_date(tomorrow_date)
        time.sleep(AFTER_MIDNIGHT_DELAY_SECONDS)
        pt_today = datetime.now().date().strftime("%Y%m%d")
        logger.success(f"[runner] daily run for pt={pt_today}")
        run_once_for_pt(pt_today)
        time.sleep(60)
        logger.success(f"[runner] pt={pt_today} all finish")
def run_once_for_pt(pt_today: str):
    """Queue and process the ``cn = 0`` tasks of ``pt_today``.

    The logged tasks are pushed onto the ``mt_third_task`` Redis set and then
    handed to ``process_cn_after_tasks_done``. Any failure is logged and
    swallowed so the caller keeps running.

    :param pt_today: Batch identifier to run.
    """
    r = init_redis4()
    cn = 0
    try:
        rows = bh_utils.query_data(
            f"select reqId,prompt,rank from geo_third_task_log where cn = {cn} and pt = {pt_today}"
        ) or []
        values = [json.dumps(row, ensure_ascii=False) for row in rows]
        if values:
            # sadd needs at least one member, hence the guard.
            r.sadd("mt_third_task", *values)
        process_cn_after_tasks_done(pt_today, cn)
    except Exception as e:
        # Report at error level with the traceback (was logged at SUCCESS level).
        logger.exception(f"[runner] pt={pt_today} cn={cn} 执行失败: {e}")
def send_task(promp_list,pt,count):
    """Insert ``count`` runs of every prompt into ``geo_third_task_log``.

    One row is created per (run, prompt) pair with a fresh UUID as ``reqId``,
    platform ``"DB"``, the run number as ``cn`` and the prompt's position in
    ``promp_list`` as ``rank``.

    :param promp_list: Prompts to schedule.
    :param pt: Batch identifier stored on every row.
    :param count: Number of runs (``cn`` goes from 0 to ``count - 1``).
    """
    data_list = [
        {
            "reqId": str(uuid.uuid4()),
            "prompt": prompt,
            "platform": "DB",
            "cn": cn,
            "pt": pt,
            "rank": index
        }
        for cn in range(count)
        for index, prompt in enumerate(promp_list)
    ]
    bh_utils.insert_data("geo_third_task_log", data_list)
    # Log only after the insert returned, so the "written" message is never
    # emitted for a batch whose insert raised.
    logger.success(f"[init_task] 已写入任务: pt={pt}, prompts={len(promp_list)}, rows={len(data_list)}")
def webhook_snipaste(promp_list):
    """Run one ad-hoc batch for ``promp_list`` and send the resulting Excel file.

    The batch id ``pt`` is the current minute (``YYYYMMDDHHMM``) while the TOS
    paths use the current date (``YYYYMMDD``). For each run number the
    function:
      1. pushes the logged tasks onto the ``mt_third_task`` Redis set and
         waits for ``wait_mt_tasks_done``;
      2. re-queues, up to three times, the reqIds that have no row in
         ``geo_third_task`` yet;
      3. runs ``doubao_process_original_data`` on every stored ``text.json``
         that has a ``content`` field;
      4. builds the export rows in a thread pool, writes them to
         ``EXCEL_DIR/{pt}美团{cn}.xlsx``, uploads the file and sends it with
         ``chat_message`` when the upload returns a file key.

    :param promp_list: Prompts to run.
    """
    count = 1
    # Derive both identifiers from one timestamp so they cannot disagree when
    # the call happens right at midnight.
    now = datetime.now()
    pt = now.strftime("%Y%m%d%H%M")
    tos_pt = now.strftime("%Y%m%d")
    send_task(promp_list, pt, count)
    r = init_redis4()

    # Platform code -> display name written to the sheet.
    plat_form_map = {
        "DP": "deepseek网页版",
        "DB": "豆包网页版",
        "TXYB": "腾讯元宝",
        "TYQW": "通义千问",
        "KIMI": "kimi",
        "WXYY": "文心一言",
        "BDAI": "百度ai",
        "DYAI": "抖音ai",
        "DOUBA": "豆包安卓版",
        "DPA": "deepseek安卓版",
    }

    def build_row(i):
        """Collect every output column for one task dict."""
        reqId = i.get("reqId")
        prompt = i.get("prompt")
        platform = i.get("platform")
        insertime = i.get("insertime")
        rank = i.get("rank")
        file = f"geo_snipaste/{tos_pt}/doubao/{reqId}/text.json"
        context_path = f"geo_snipaste/{tos_pt}/doubao/{reqId}/context.txt"
        quote_path = f"geo_snipaste/{tos_pt}/doubao/{reqId}/quote.txt"
        think = f"geo_snipaste/{tos_pt}/doubao/{reqId}/think.txt"
        png_url = f'https://tcdn.aidso.com/geo_snipaste/{tos_pt}/doubao/{reqId}/png.png'
        # A missing text.json is treated as an empty object so share_url falls back to ''.
        text_json_str = tos_utils.get_string_from_tos(file) or "{}"
        return (
            prompt,
            plat_form_map.get(platform),
            tos_utils.get_string_from_tos(context_path),
            tos_utils.get_string_from_tos(think),
            tos_utils.get_string_from_tos(quote_path),
            json.loads(text_json_str).get('share_url', ''),
            png_url,
            timestamp_to_datetime(insertime),
            get_context(png_url),
            rank
        )

    for cn in range(count):
        # --- queue the batch and wait for it to drain ---
        rows = bh_utils.query_data(
            f"select reqId,prompt,rank from geo_third_task_log where cn = {cn} and pt = {pt}"
        ) or []
        for row in rows:
            row["thinking_enabled"] = "1"
        values = [json.dumps(row, ensure_ascii=False) for row in rows]
        if values:
            r.sadd("mt_third_task", *values)
        logger.success(f"[pt={pt} cn={cn}] 等待 mt_third_task 跑空...")
        wait_mt_tasks_done(r)
        logger.success(f"[pt={pt} cn={cn}] mt_third_task 已为空,初步完成")

        # --- backfill requests that produced no result row ---
        for attempt in range(3):
            diff_result = bh_utils.query_data(
                f"select a1.reqId,a1.prompt,a1.`rank` from (select reqId,prompt,`rank` from geo_third_task_log where pt = {pt} and cn = {cn}) as a1 left join geo_third_task as a2 on a1.reqId = a2.reqId where a2.reqId is null") or []
            for row in diff_result:
                row["thinking_enabled"] = "1"
            values = [json.dumps(row, ensure_ascii=False) for row in diff_result]
            if not values:
                logger.success(f"[pt={pt} cn={cn}] 第{attempt + 1}轮检查已无缺失")
                break
            r.sadd("mt_third_task", *values)
            logger.success(f"[pt={pt} cn={cn}] 缺失数据开始补充")
            wait_mt_tasks_done(r)
            logger.success(f"[pt={pt} cn={cn}] 缺失数据补充完成")
            # NOTE(review): the reviewed text had lost its indentation; this
            # pause is assumed to belong to each retry round — confirm.
            time.sleep(180)
        logger.success(f"[pt={pt} cn={cn}] mt_third_task 已为空,开始汇总导出/发送")

        # --- parse the raw JSON stored for every logged request ---
        result = bh_utils.query_data(f"select * from geo_third_task_log where cn = {cn} and pt = {pt}") or []
        for entry in result:
            req_id = entry.get("reqId")
            if not req_id:
                continue
            file = f"geo_snipaste/{tos_pt}/doubao/{req_id}/text.json"
            try:
                raw = tos_utils.get_string_from_tos(file)
                if not raw:
                    continue
                content = json.loads(raw).get("content")
                if content:
                    doubao_process_original_data(file, content)
            except Exception as e:
                # One bad file must not stop the rest; report it as an error
                # (it used to be logged at SUCCESS level).
                logger.exception(f"[pt={pt} cn={cn}] 处理 {file} 失败: {e}")

        # --- load the finished tasks ---
        query_list_log = bh_utils.query_data(
            f"SELECT DISTINCT reqId FROM geo_third_task_log where cn = {cn} and pt = {pt}"
        ) or []
        req_ids = [row.get('reqId') for row in query_list_log]
        query_list = []
        if req_ids:
            # An empty list would render as the invalid SQL fragment ``IN ()``.
            uuid_str = ",".join(f"'{req_id}'" for req_id in req_ids)
            query_list = bh_utils.query_data(
                f"SELECT DISTINCT reqId,prompt, platform,insertime,rank FROM geo_third_task WHERE reqId IN ({uuid_str}) order by rank asc"
            ) or []
        task_list = [
            {
                "reqId": row.get("reqId"),
                "prompt": row.get("prompt"),
                "platform": row.get("platform"),
                "insertime": row.get("insertime"),
                "rank": row.get("rank")
            }
            for row in query_list
        ]

        # --- build the rows concurrently, keeping the rank order ---
        batch_data = [None] * len(task_list)
        with ThreadPoolExecutor(max_workers=50) as executor:
            future_map = {
                executor.submit(build_row, i): index
                for index, i in enumerate(task_list)
            }
            for future in as_completed(future_map):
                index = future_map[future]
                try:
                    batch_data[index] = future.result()
                except Exception as e:
                    logger.exception(f"导出数据失败 index={index}, task={task_list[index]}, err={e}")
                    batch_data[index] = None
        # Drop the rows that failed to build.
        batch_data = [row for row in batch_data if row is not None]
        logger.success(f"时间{pt}--{cn}---导出数据{len(batch_data)}条")

        # --- write, upload and notify ---
        out_path = os.path.join(EXCEL_DIR, f"{pt}美团{cn}.xlsx")
        excel_writer = ExcelWriter(out_path)
        try:
            excel_writer.write_batch_rows(batch_data)
        finally:
            # Release the workbook even when writing raises.
            excel_writer.close()
        file_name = os.path.basename(out_path)
        file_key = upload_file(out_path, file_name)
        if file_key:
            chat_message(file_key)
            logger.success(f"[pt={pt} cn={cn}] 已发送飞书文件: {file_name}")
        else:
            logger.error(f"[pt={pt} cn={cn}] 上传失败,未发送飞书消息")
# =========================
# 主入口
# =========================
if __name__ == "__main__":
    # The scheduler/runner threads are currently disabled:
    # t1 = threading.Thread(target=scheduler_thread, daemon=True)
    # t2 = threading.Thread(target=runner_thread, daemon=True)
    # t1.start()
    # t2.start()
    # t1.join()
    # t2.join()
    # Running the script only deletes the 'mt_third_task' key and prints what
    # the delete call returned.
    redis_client = init_redis4()
    delete_result = redis_client.delete('mt_third_task')
    print(delete_result)
\ No newline at end of file
# -*- coding: utf-8 -*-
import time
import redis
import requests
import json
from aidso_geo.core.commit_process import redis_client8
from aidso_geo.models import spider_save_tos
from aidso_geo.utils import tos_utils, bh_utils
from aidso_geo.config.base_config import init_redis
# Module-level Redis client, created at import time; process_batch() pushes
# its queue entries through it.
redis_client = init_redis()
def commit_task(platform):
    """Submit the fixed test task for ``platform`` to the task_commit endpoint.

    :param platform: Platform code placed in the payload and appended to the reqId.
    :return: The ``reqId`` field of the JSON reply (``None`` when absent).
    """
    endpoint = "http://172.200.5.67:8086/api/geo/task_commit"
    task = {
        "prompt": "羽绒服品牌推荐",
        "brandWords": ["抖查查", "哎搜"],
        "comWords": ["蝉妈妈", "考古加"],
        "taskId": "all_test_task1",
        "reqId": f"all_test_req1{platform}",
        "platform": platform,
        "type": "stream"
    }
    reply = requests.request(
        "POST",
        endpoint,
        headers={'Content-Type': 'application/json'},
        data=json.dumps(task),
    )
    reply_body = reply.json()
    return reply_body.get('reqId')
def get_result(req):
    """Query the task_check endpoint for request id ``req`` and print the JSON reply."""
    endpoint = f"http://172.200.5.67:8086/api/geo/task_check?reqId={req}"
    reply = requests.request("GET", endpoint, headers={}, data={})
    decoded = reply.json()
    print(decoded)
import openpyxl
from openpyxl import Workbook
import os
class ExcelWriter:
    """
    Excel writer that appends six-column rows (prompt, platform, context,
    quote, think, time) to the active worksheet of a workbook file.

    An existing file is opened and appended to; otherwise a new workbook is
    created. In both cases row 1 carries the column titles.
    """

    # Column titles for row 1, in the same order as the values of a data row.
    HEADERS = ('prompt', 'platform', 'context', 'quote', 'think', 'time')
    # The two-column header the earlier existence check looked for; a file that
    # starts with it is still accepted as "has a header" and left untouched.
    _LEGACY_HEADERS = ('taskID', 'response')

    def __init__(self, file_path):
        """
        Open or create the workbook.

        :param file_path: Path of the Excel file (e.g. './output.xlsx').
        """
        self.file_path = file_path
        self.workbook = None
        self.worksheet = None
        self._init_workbook()

    def _init_workbook(self):
        """Load the workbook when the file exists, else create one; ensure the header row."""
        if os.path.exists(self.file_path):
            self.workbook = openpyxl.load_workbook(self.file_path)
            self.worksheet = self.workbook.active
            self._ensure_header()
        else:
            self.workbook = Workbook()
            self.worksheet = self.workbook.active
            self._write_header()

    def _write_header(self):
        """Write the column titles into row 1."""
        for column, title in enumerate(self.HEADERS, start=1):
            self.worksheet.cell(row=1, column=column, value=title)

    def _ensure_header(self):
        """Insert the header row unless row 1 already holds a known header.

        Previously an existing file was compared against 'taskID'/'response'
        only, so a file created by this very class (whose header starts with
        'prompt') received an extra, wrong header row every time it was
        reopened.
        """
        first_row = tuple(
            self.worksheet.cell(row=1, column=column).value
            for column in range(1, len(self.HEADERS) + 1)
        )
        if first_row == self.HEADERS:
            return
        if first_row[:len(self._LEGACY_HEADERS)] == self._LEGACY_HEADERS:
            return
        self.worksheet.insert_rows(1)
        self._write_header()

    def write_batch_rows(self, data_list):
        """
        Append the given rows below the last used row and save the file.

        :param data_list: Non-empty list of 6-tuples
            ``(prompt, platform, context, quote, think, time)``.
        :raises ValueError: If ``data_list`` is not a non-empty list, or a row
            does not have exactly six values.
        """
        if not isinstance(data_list, list) or len(data_list) == 0:
            raise ValueError("data_list必须是非空的列表")
        # First free row below the existing content.
        next_row = self.worksheet.max_row + 1
        for offset, row in enumerate(data_list):
            # Unpacking enforces the six-value shape (ValueError otherwise).
            prompt, platform, context, quote, think, time_text = row
            values = (prompt, platform, context, quote, think, time_text)
            for column, value in enumerate(values, start=1):
                self.worksheet.cell(row=next_row + offset, column=column, value=value)
        self.workbook.save(self.file_path)

    def close(self):
        """Close the workbook to release its resources."""
        if self.workbook:
            self.workbook.close()
import time
from datetime import datetime
def timestamp_to_datetime(timestamp, fmt="%Y-%m-%d %H:%M:%S"):
    """Format a Unix timestamp as a local-time string.

    :param timestamp: Seconds since the epoch; anything ``int()`` accepts.
    :param fmt: ``strftime`` pattern for the result.
    :return: The formatted local time.
    """
    seconds = int(timestamp)
    return datetime.fromtimestamp(seconds).strftime(fmt)
def process_batch():
    """Re-queue every in-progress ``stream_batch`` task.

    Rows of ``geo_commit_task`` with status ``ING`` and type ``stream_batch``
    are loaded, their JSON-encoded word fields are decoded, and each row is
    pushed as JSON onto the Redis list ``{platform}:geo:{type}:list``. The
    number of loaded rows is printed first.
    """
    pending = bh_utils.query_data("""
select reqId,
prompt,
brandWords,
comWords,
taskId,
platform,
type,
thinkingEnabled as thinking_enabled,
searchEnabled as search_enabled,
comWordsMap
from geo_commit_task
where status = 'ING'
and type = 'stream_batch'
""")
    print(len(pending))
    # Columns that hold JSON text and are decoded before re-queueing.
    json_fields = ('comWordsMap', 'brandWords', 'comWords')
    for task in pending:
        for field in json_fields:
            encoded = task.get(field)
            if encoded:
                task[field] = json.loads(encoded)
        queue_key = f"{task['platform']}:geo:{task['type']}:list"
        redis_client.lpush(queue_key, json.dumps(task))
def to_excel(file_name, output_dir="/Users/yaowentong/Desktop"):
    """Export the stored answers for one fixed prompt to an Excel file.

    Tasks are read from ``geo_commit_task`` for the hard-coded prompt on the
    platforms DPA, DB, DOUBA and DP. Only tasks whose ``context.txt`` exists
    in TOS are exported.

    :param file_name: File name without the ``.xlsx`` extension.
    :param output_dir: Directory the file is written to; defaults to the
        path that used to be hard-coded.
    """
    query_list = bh_utils.query_data(
        f"select * from geo_commit_task where prompt = '经济纠纷哪个律师比较厉害' and platform in ('DPA','DB','DOUBA','DP') ")
    task_list = [
        {
            "taskId": row.get("taskId"),
            "prompt": row.get("prompt"),
            "platform": row.get("platform"),
            "insertime": row.get("insertime"),
        }
        for row in query_list
    ]
    # Platform code -> display name written to the sheet.
    plat_form_map = {
        "DP": "deepseek网页版",
        "DB": "豆包网页版",
        "TXYB": "腾讯元宝",
        "TYQW": "通义千问",
        "KIMI": "kimi",
        "WXYY": "文心一言",
        "BDAI": "百度ai",
        "DYAI": "抖音ai",
        "DOUBA": "豆包安卓版",
        "DPA": "deepseek安卓版",
    }
    batch_data = []
    for task in task_list:
        taskId = task.get("taskId")
        prompt = task.get("prompt")
        platform = task.get("platform")
        # Named so it does not shadow the ``time`` module inside the function.
        insert_ts = task.get("insertime")
        context_path = f'geo/{taskId}/{platform}/context.txt'
        quote_path = f'geo/{taskId}/{platform}/quote.txt'
        think = f'geo/{taskId}/{platform}/think.txt'
        if not tos_utils.check_file_in_tos(context_path):
            continue
        batch_data.append(
            (
                prompt,
                plat_form_map.get(platform),
                tos_utils.get_string_from_tos(context_path),
                tos_utils.get_string_from_tos(quote_path),
                tos_utils.get_string_from_tos(think),
                timestamp_to_datetime(insert_ts)
            )
        )
    excel_writer = ExcelWriter(f"{output_dir}/{file_name}.xlsx")
    try:
        excel_writer.write_batch_rows(batch_data)
    finally:
        # Release the workbook even when writing raises (e.g. nothing to export).
        excel_writer.close()
    print("数据已成功写入Excel文件!")
def get_req_id():
    """Fetch the request list for the fixed phone number and date range.

    :return: The ``data`` field of the JSON reply (``None`` when absent).
    """
    endpoint = "https://openapi.aidso.com/openapi/ywt/geoReqList?phone=18900000010&begin=2026-03-10 00:00:00&end=2026-03-10 23:59:59"
    auth_headers = {
        'Authorization': ''
    }
    reply = requests.request("GET", endpoint, headers=auth_headers, data={})
    reply_body = reply.json()
    return reply_body.get('data')
import csv
def to_csv():
query_list = bh_utils.query_data(
"select * from geo_commit_task where reqId in ('402e0366-415f-46b4-8a5e-be270679d5c0','764554d3-148b-420e-86a2-40dd83d062ba','f761258d-2372-469a-bda3-1cb3b881c4a0','e78416ce-5c83-4a85-a18c-32e4916e9250','37ae17aa-270e-4a32-97b6-86aa691d57a6','bea11246-12d5-40f3-8a70-e71d0074862a','04d9dcea-8ca7-4a1c-8196-f91ae8fd0633','c2d1f6b4-aa13-4827-994c-19820e90f5ae','df0d72b4-8d43-4761-bf8f-40dfe2516943','5bda7418-ab5e-46b4-9e36-6d0a770201de','f08d91b9-58c5-4a21-a001-0b5c6e13bae1','dcaaced7-9259-40e9-b23d-7d62b46139ef','032cf0e5-92a0-448f-ab5f-5986e5cb16b3','f9ae9c57-c3f3-4446-be78-9f8af2ff5270','fa28ef1c-ca4b-4d86-a1d2-7d0f8cb8a0ca','d9d5001b-f5da-49e4-be58-d722d671fabb','3b490bcb-002e-486d-9e1e-a2058879dd4e','e12ad0f7-3b60-439a-a322-6d8012f3a742','73d31139-3b35-41eb-9550-848635c865c8','bda4126c-b392-4c0b-a2e9-fc2e28af2b46','344e96bf-fd18-4218-9367-0f723a1aa4ee','9a76d464-e628-46c9-bd69-6ea6a503e312','3935c16d-5eea-4370-ac3c-bba3ff95119f','b458576b-dda4-4aaf-bf72-70325f509b92','a0e28647-ddaf-426d-b1e3-f32cc00dd54a','1d28f8f7-9e94-47e5-90e8-85b6970e4699','5b6eaaa9-927a-4a4a-9ef1-0e85f3eefe7a','5b774d87-a3b1-4b31-b702-dd7681460127','1449ab66-16c0-4a68-8106-2c11dbf63d0e','9149ec81-df0c-4f96-94aa-9f5f2b5c7629','65db6517-1b89-4cff-8305-cde546442ad1','bb8f2c88-df8c-4634-95d0-ed04d710f1c6','4a3c21e9-dd9b-48da-8e4d-697216434b4f','d4d68a65-c0dc-46f4-92a7-970be2935d6b','0bbbded6-a1cc-4d3c-82da-48533cf9976c','0f54581f-7d98-4d9f-a1d2-66da6cc6d5a9','43640355-f350-42fb-9d02-d70ffeedfdc8','8a990c73-13c8-4669-aa36-79553decca0c','02a18ed9-e22c-4ed7-a3b2-31e4349de211','c5f20520-e3e2-4b9a-9e29-bdb3d0defc1a','e816140e-6c19-477b-a100-97674b619cf2','001aa067-3444-49e7-8358-b71e4186393f','de784e81-08d6-425b-8375-7d0a2d83313d','b5f279a5-d616-4aea-a757-139090f147a4','a8ee8ff9-3744-4d7f-af8a-a0f63c415455','88644b3f-090d-45f1-9be1-a385b207ecda','6270cd9f-1786-4dd0-9f2d-66e7a73b6a00','661feb8c-eca4-49a0-a654-e8963e86cec5','1cc7dce2-b853-49b0-9d76-c8ef2c8f60e2','7ec90953-8be1-4d7d-a4b6-7925675564b8','c5
b0297d-a296-4fff-a9f5-2f18324f6393','bd35adfa-dc9a-4bfb-bb5b-ea1d8723c38b','a117e7fd-6438-4309-a8e5-43b7203fcb13','b9d5931a-af54-41e8-b632-d1a75bb845e9','6d04a745-cfa3-4b69-b9bb-6e153fbcbde3','5a0e6049-a195-4bef-aa6e-6fc4e5cd3237','4c26570f-e217-4a08-97d2-2962cc535278','14cdc028-792a-4e14-b6a4-d43eac4fad3e','027e1685-df78-4820-93a1-aa68921ccaa4','51792899-0d97-450f-8828-38308702117c','fc96735e-9a62-4bea-8045-bbe6e13b9ce8','8ebd2872-e80d-48d5-935b-410a56352d68','38015432-15a2-42d9-86aa-3f7378501749','a4fa61e9-4b5c-4d37-839c-53fa49900d90','960914c9-8964-4d04-98be-61a8540aeaf8','a2c2cb76-ac37-4ee8-8d1c-2fe2351c6e6d','71b4811f-6d9e-4d57-941b-3795ed13685a','2147559e-bb61-43dd-8c9f-665c53d65f7f','a2153afa-7f13-453e-adc9-c88d1acbad91','a26d4823-1ff6-4b9f-b605-d8a652a5803f','425bc4bd-7862-4a9e-a54d-e5021f2410d0','7988a5ab-5d4b-444e-ae65-aacc10778cdb','60c5acee-f379-4b5a-9984-22e9647c9c41','fb01506a-da7d-4c5d-bfa6-41c88bcde201','b5ca42c7-4153-4976-a45e-669168b4f10f','dfeac8c6-2694-495a-bebf-f65cdd4a3cdf','4d759901-3df3-48ee-8301-9cd2f701102c','85446bdb-78b7-4124-b4bf-662f3a56d7c4','d9c5dbf1-7a85-4a5e-855f-422dc3242505','7d96a681-0a2b-4d9e-a3b4-776d12df61f7','e6a44084-65bf-489a-9f13-a39e7a75e52e','2d0b76fd-7b10-4cc5-8b85-6bf75d3c2176','2076564f-f3c0-475d-b03d-87a726e8b2e8','656be388-a590-4a89-b7f0-a93e9beb09e2','2d6f0298-fd1e-493f-bb33-3c7b4b126094','21b9f675-1c98-497f-aebb-545b64aaae5f','00367916-4f52-4a0d-bdac-a480fa15f5dd','67ec62d2-9b64-4040-8176-1d51b7501933','68ce2fa4-79a6-4ce5-987a-50f8a11a55c9','e32c763c-e382-4b2e-a1fd-20267db7535d','856ac9cc-4e01-4609-8da7-f69dadc0ec01','f24b7fa3-edd9-4380-868e-436873153628','9dcd94e4-add7-4738-93d8-cb04ae135cdf','bf58bc69-7022-43c6-b7ae-e50b7bc8e3b3','0ab321f8-8104-4cdb-9eda-d3910e828f78','f633be11-21b3-465f-9152-df80eb0aa7f5','0d2e9e81-7187-40f8-880f-d443fdf3bba1','4f517d93-573f-4490-b797-d569939fe93c','daca5655-e7e7-445e-999c-8b95c748c26d','5b98ba7e-d012-4297-9f1d-060c5f03ab20','2f13a859-dc2a-4ffe-8a90-644a3bb200fd','977492ff-923e
-40b8-8789-c96a41a3040a','103c4022-48ce-4821-b063-22f79490a094','7f1986d0-84d8-4155-b561-5570c0db2f3e','1af9153b-0fe4-4c60-a906-73414622cbbe','719085c7-5e76-4ab3-b1dd-c837342ab56c','e129687d-003b-43be-a2ad-4b00beb34742','09394f10-bf38-4176-b0e5-ba14faa209bc','b5e775ab-f12e-40b6-9c17-e6c06cd9fd73','96ce8651-ce30-4e08-9747-fb2d18738afd','e8931f38-302e-4ec2-a99f-777f8262b0fd','b566da9d-25f6-4ac9-82f4-12ad66b59169','898550c7-f9b3-4072-8381-3f98cdabe447','385f6213-f427-4e05-8c55-e21cc13aedac','9346b336-b594-40eb-b032-f389f83cf2ec','9197e1c9-e115-4ec0-b934-e8090c4d8294','fbbb1fa7-d009-4b87-90b6-5fc9b061460f','03c52d6f-6694-4634-a6a3-76a110b2f75e','7f901ea0-b429-4c70-87ec-4849c0f241be','ce553bf9-84ad-475b-9208-65d727587811','e7cabdae-3bbd-4ff6-8b20-f54c7b10cf4e','f6e250ac-74f3-4665-a085-17d2272ef8a7','c18629ff-225e-42b8-a0e3-db4b614c361c','94b69431-3122-4bc6-8b59-0fd965ca8a55','57343d82-4701-49f5-b2f4-95776a5e491a','396159bf-4732-4a72-93f8-98b41671935f','6270d631-cfac-4fb7-a3cc-a5239c167369','8d9534a6-2262-4b41-a397-1cb5109c1e4d','00c0727f-bcf9-43cf-b827-c1cbe8fd4f89','8e4b4498-d52a-4dab-a3c9-008f5c36bb54','38a0b7c0-7d89-4508-9e93-b92146f0f37f','614daa6c-af87-40bd-83db-f48fc5e6b947','e6d331ec-f3a1-4eb4-a0c7-678893c6336d','1d5f705e-13da-402f-9ef4-fe6223534304','8ab68210-44da-42da-b66f-a9980baed8d6','71342621-08cc-436a-8b7e-d2b362009d6b','59f95902-93ff-467a-8087-1fb358ce6893','e3f67cc0-9fce-4102-80b8-1a6eb55d623b','9a8c9969-16a3-4ba1-8fe1-d981715ec882','11cf203c-ba8b-4794-87f4-df41edd82d4d','a0a6805e-2776-42ee-9185-3d55042c158c','6f67ec51-cc55-4f1e-b9df-5803c1b96104','72e96887-0bf8-4a7a-ad96-9ce37a460c61','9afe5f96-295b-487e-b120-d9688a92eae6','98540dbe-cb8e-4549-a57e-e6f5bd4ab705','1e8a95a5-01cf-4cb5-86d8-afffe27689e6','cdae4810-9ce1-4ac7-b820-0cc678b2182f','10c7d292-fa7e-4b68-a746-ca45f6da6967','78521cef-f7d8-4d37-b104-fcad04113244','78cfc55c-29c0-4c02-be86-41ec48f15026','5e737176-8fe8-48c9-8732-4bd75e274869','8e323e60-1784-4274-adea-191cd75b77c5','a343d700-ae56-4b3a-b71b-
9f11a09e0f59','ae54a8dd-6348-4b2f-80d2-20143732f28c','b550b730-4848-490a-acdd-075c787e7f61','217fad3b-c0ad-4d17-b334-9f7d5b17d02c','899157b6-672c-4280-af5d-b48b78437d11','98147978-f334-4ff5-8e32-3389ce495756','d08a0c3d-522c-4fe9-8d8a-ddffdb04bfa7','32720b8a-abe9-4833-913b-66b9e6821d72','42056a6d-9fb1-4d71-81af-5b5eb653b427','a37fb18a-3a61-4843-95c2-9598d5f3303d','b76f8068-9f94-431a-8398-7cadb933c0d9','c400c60a-f5dc-4879-b103-a7b67440d5c3','4d4bbc07-6fa9-490f-aa0f-1958247650f2','c8e1f75d-d882-42ee-88a9-e89f2da85700','442732cc-76b3-444c-9af4-c410adb3570a','065d4f3a-098c-437a-912f-2d3020ef2950','ef3ebb13-a965-40d4-90c9-e2109c532e60','6b44c62d-dfc2-4f2f-8658-35b1bc977739','fddccf26-d822-4c23-baeb-f9c98a5bb199','bcd6049e-9853-4bd8-a9b3-4f783db1b0b2','1d7c519c-0b2c-46ef-a29c-c09cde85bac3','519ccbe6-e38e-4f02-bfbc-6bed858207e7','43d45a2d-82ac-49dc-9117-93cc48efb7eb','55bddac9-92b0-4518-b278-2f90cde0cb22','18e59a0c-5426-43cf-9e72-9bc712ced6d7','b33e4dff-2527-4b14-881f-cbd63819268b','4ef0e57c-ffa9-4d36-8127-4fcddfdbc389','dcbeef9c-8b0e-46cb-ba82-88486f4395ff','758013c5-73a8-42cc-9308-be027d5345d9','d466aee5-c550-4cb6-af7d-5e1bfa8d5f6f','0d032268-910d-4643-ab06-366741aaef8d','1b62afdd-cabb-4fae-8513-03d0e35cedb0','83da0acd-0db1-462f-aa7e-735140e812aa','3167562b-8460-4b72-b4b2-497a77a7a6cf','32df6d21-8663-42d9-81b7-deee4a009364','0d679eca-3c07-457f-9222-876e9bb084d7','489ccce8-0a57-4505-9cad-362c03df4f26','8226381d-9db5-497b-bbd0-fc14dd8b38d9','42bf91dd-8eb1-47a2-8caa-a878bb278243','85532823-9832-419e-ae17-8204d763b24a','67e0c670-8059-4d80-8d54-4dcfd31ba497','94d54f8a-cd38-43ca-b014-7a16fe11b180','f904ad94-0abf-4d51-9c66-6d49a2daccf2','2d55dc16-4229-4180-8c03-598641929495','1f548c43-bdc6-4e36-aa90-a889c9fd2e60','66db16dd-5fb1-4da3-a11b-e99a4632dc80','158fb147-3068-41a9-8cdb-e80c53e9d07a','b7b5e7b3-09ba-4941-b760-54b95d76c500','b9bb6409-cb61-4295-bcf6-c23956634a13','192c1924-0cb5-4e6f-abd7-c39608393a71','198f6cb9-23aa-47d5-a9b8-1334e8047e0e','5d6a29ed-f088-40cc-a828-798b9481ec5
e','f6f1882a-5a2a-413a-bcc8-15c5a30c5665','00cf8877-321a-4908-a63c-ea748ec4c7b2','7275a6e4-7c1e-480d-88fa-68f59a7066c2','515b6157-40b4-4092-bbd2-89c83dc6ee63','5ba1e3cb-e7f9-427e-8ce8-ed7eeda37397','39c7ce59-a0f5-4cd1-9645-4aa9212ef96d','2dc5d866-107a-4f7e-8514-9ed2bdf35543','b3bd0a0f-985c-4e5d-ae96-ffa0c4beb16f','05cc4549-97de-4ee1-a4e4-4a6c4d8e3b13','c3b72930-468f-44d0-b522-598bc4db0fa8','0a89723e-3733-4031-b9cc-6bbba105fb45','39282ff2-ca9e-4039-905f-9146f71bc363','e12dfb28-5fff-4a60-adba-8979333bb37d','79af884e-f3f8-46b3-96e1-c603e543dcd2','de3bb456-667e-4ee4-bd13-1c2e0dee31cb','049c48e4-8910-4bb9-af63-a5e223aae60a','bc836645-650c-41d3-8fe7-7be54a67260d','a03ac9a9-385c-48a7-8e9d-1682f7476086','993a7ce1-6db4-4ba5-a8e9-a89ec7090aac','48bbf7cd-2c23-4a05-8e7b-b284c6ff9072','5155e653-01da-4e07-b8fc-b2b8cbf370b4','44216d61-62e9-43c4-a81f-456a788500f5','a9467f8f-4a60-4467-a232-a697ce73bdf5','8aba38e4-44a4-4079-a9fe-ac36e9f5ace8','d781e2e1-2d63-45cf-946a-219f86996314','1a4f306b-f889-477b-9c76-f6e131e72748','bd171dd3-0ecc-4b7d-911b-703345fa7d03','e191cacf-e5ff-4dc4-84db-a2a9952434b9','f4dbceaf-5313-47e0-9643-acf2a9a69952','18363a0b-e51f-4ae2-a726-15210aca2c32','f8f7e9e0-3cd3-4311-962b-5f45d5318175','83d2ea2a-5ae8-4608-8aa2-8023fcd5dfcb','4aaba832-1711-489f-9e91-3bd3ab49ec78','9208c6e4-0fbe-4597-8db3-8b641a27ede6','4bcfe7c6-fa8c-4d43-8427-92b2c6db242e','9932df53-8dcc-4160-b576-3da316b094f4','d55bc555-c299-4072-b535-b469ddf8ea9b','cfb23850-8476-4429-8b7d-8dd5386c7ad4','71b4750f-8f90-4841-8cfb-da5796729b33','7ced1030-f0fe-4c07-9623-443134159f52','0142509c-b715-4d1e-a897-99c7f68e1f72','8f64327a-7bf4-4cbc-a0cf-94c10944e8dd','ef49cdc0-6417-4421-b394-7ee31ff1de19','fe194aac-1576-47c7-8cb2-9634067613d5','3a5bdd7a-90de-46bc-8d4d-d8592eac7a8f','a1a31039-bd3b-4eef-8d12-a00422be8b34','04e54df4-811a-452d-a937-28da81fcc57e','5f54e4a8-597b-44de-9cdd-e677bec40b5f','0e4e9035-89a7-4b3d-8300-6ae82f329f99','9a4bdc4a-38b2-4341-acc8-88614863471c','24deb343-baec-4546-978f-d82daa2a84ac','0ad4128
c-8bef-4506-95de-0ea10d6b3777','9a1b465d-07f8-4e6d-a353-96fbfc2ab319','4957f366-3893-4e27-9fc2-14801ad5c4ef','14cf293b-b5d4-4228-a54a-249558502b1c','1a580eec-31a2-44d7-8947-e166e8785fdd','b853a74e-f897-4943-9d70-fc6ae46eacc5','eff899e6-d27e-40fd-bad6-0bd89716f136','7ae93cbb-c50e-471d-8607-511c11695e3b','6b8a6e25-ee7c-4158-b2bd-688a3adaee19','3fca39b5-88b3-4ad6-8782-88f969df3906','a00555e0-c35f-485d-ba34-e59ce065b659','b970689a-b7e1-489d-94b9-31a901dae834','55faec72-5f02-40e7-81a3-a8bfbebf556f','8e8c354f-fd9c-4fe4-a55e-414060586e25','aaf9e60d-8d3e-426b-94b5-f052860d10b8','730a9da6-da00-4eb1-b8d7-34b1f226b654','341fa26d-ce5a-4763-9ca2-31104741659b','ca2a39f8-1281-4a42-a07d-dbf77c89c5b1','8cab18e1-5b16-4692-a272-eaa7cf814534','79bb416a-acc7-40f1-8699-7a57e7861d05','8c6fa91f-18ba-4e8a-b29b-1fe30414ab1f','ba6a410a-e746-4447-90a9-b7699f67304d','51826d85-4f83-4dd6-87c2-c754e8deb2f9','b057359d-41a3-46ce-88de-473da52d9db3','dda83cc1-23aa-49ef-b3fc-9b1f94342005','bf50de19-bb93-415f-8fb9-0db610c998ed','22012d9f-8405-4873-8814-784a1df3e0c6','c437bc38-31c2-47b9-8d00-cdbda2a1c6c5','2c743e2e-d80b-4b9e-8bc2-932a7e4c756c','c855307e-134c-41fe-99d0-7a650d81dc3b','b8da3a75-8d0b-4745-b03d-0fb87472ae48','046fc7a4-7fd0-4513-ac06-3006de1b614a','b2f5685e-d7a8-4e63-8d21-8ca0aee75f5c','c1473bdb-1ba4-45fa-9f81-14daf32fddb6','7a837f2e-429e-4bee-9abb-f2440a5a0eb3','25cbe593-c7d8-42cb-919c-4ac0b0af3f1a','b3300982-a0e0-4d31-9fa7-ee51ac49b68b','4df7a2f5-4610-43b3-a344-f6fa7f461bee','34ba649e-7d3e-4e6f-9757-139394dd1d94','06f229f4-f998-4584-a704-023c168133ac','5c0c9344-ebe6-4f28-b75e-2dc74a86cfff','9edb6e55-270b-4093-a510-3c46de63e6a4','c4fb2c57-6156-47f1-915b-25995f28c432','4d86e1c7-071d-4a45-a938-0e8c6a377de9','46d6c433-7110-4a25-88f7-cb38ac068879','1696ebff-9b26-44dc-8ec7-197b4beff925','20ab1f06-bd7a-489b-9f91-c40fb5a430ac','268bd57b-f4cf-49d0-817b-65bce1206705','5a136fdb-7fdd-410d-97fe-efdfefe5da0b','20be731a-30d0-4f48-8ab3-3e3cf5a38ab4','37d2d3e6-9e81-4f1f-b373-53c744a745fb','a7afebd5-a6fc-4c23
-8089-79f29e2e83b2','3b289fea-dd6f-460d-82d0-0981c7b824d9','af34ddb7-989e-4d88-9f3a-53cbe97d9ad5','635de1cb-9288-4a27-904a-97fb74b4ca6e','d66c2dd3-d404-41bf-8c1f-662c9c3ca38a','01382d0e-85ae-47eb-85ef-cf21d59f066f','c457ac2c-5d26-457c-992c-ad01069b82b0','39b7f038-81f8-4ed6-9fba-c24c1b206010','76ee2cae-9639-45a7-9dbb-002372400835','d3314ed6-07da-4e53-9331-64d6a1155265','01c16eb2-1a7f-4339-a9a7-7edfeb7652a3','f793659b-7c8a-4bb9-8140-c4eca10be7d4','b25e9f3d-d1ad-4672-86bd-094e8231a14f','2d3e3e22-f1f2-45d9-8e4f-6c4f7d9f0271','d7f84b76-66ce-45d9-839c-951ccdf98130','ae49e999-316c-4a06-9a62-6b779a8b4b16','81180b70-f631-4517-8c27-e361cae04ddf','5c516e38-77f2-42f4-b00f-966c2d4dfe77','79b6d29c-9198-4068-a5f3-c5948288c6f7','ee4178ec-25a7-4e68-8dd9-852d2075e3b9','fedc3ba5-d25d-4839-9efd-0525a6aa0f0f','ec4deeb0-afb3-4889-bd3f-f3ec8c450cff','f08156bf-8e35-4ec6-bcfa-73a808c1a160','6477db44-bcd1-4075-a7dc-a675358cebfb','a4be30d4-fa36-459b-b8c9-3622a616ca20','528107ef-7a0c-4d02-9d9a-7a42c1c03993','4d0ae58f-4cee-422a-a04e-ca31ff1aab7a','529835d1-f4ba-41ae-a225-a376c36f93ac','1fac7a2f-ccbb-409a-9c93-3fe7df28bbba','e3337410-4aa1-4105-91f4-17db3aa10350','929386bc-2a53-4cc3-b934-74d7c77c8d3b','67c309b2-fc8d-43a6-86ea-8b45b0acb554','a0e3b82b-5447-444d-8f20-a11dc36a9155','61fd794a-ba5c-45e4-bac8-699e0a6939ca','f11b2aa9-b2d8-461a-9d26-230e77c2b238','9e82fddd-ca84-4d1e-b0a9-a8e1267175cd','971979a2-3b4f-4fe2-ac07-6e7c09a898ef','5f0886ca-507e-4440-91c7-7ad870bf8365','50b1ae80-1154-49fa-8511-430afcc612e1','f4414919-7634-4ef0-80b4-a2414775367e','34c6d5c4-efa3-4281-8664-88a8ffd188b9','fd6c251c-66c3-4188-a0e0-77dc1c7ceb36','45962005-6ec2-482c-8881-7ec3a8984735','dd946f04-8908-469c-a5e6-6e731b836224','d522e7c7-6861-43ec-99dd-60e6fc708c4c','c5674c74-0436-41be-b72f-db0f4fb43083','869ff9fb-d180-4739-9857-56de6907c993','da956072-a90a-4a20-8155-33fa98706852','4b20fa14-dcee-41db-b91a-988daa0b3ded','798c10cd-f463-42b5-adf3-82709753dc2c','14b9f3d3-4a39-4907-a8f3-5f6c8b570630','bf52b1e0-8020-428d-ad1e-c1509
52c7265','d03d0c3a-f9be-4db8-9376-14df5957be2a','02141813-2726-408f-94e2-7ec5d4d314c7','8bbde9eb-2e3d-42e3-af1b-dee84aef2847','c7541e01-dd20-4fb5-81a9-128ea4d9250d','65371dff-444a-4c66-902b-c300685e7032','d8d05ae1-ca30-46e9-92ab-6a8f579ef1dc','da8959e9-003d-46b2-8ec1-394f12b43246','162a52ce-4576-4d5b-9914-af0bd63210a9','cadc018d-c73b-4c67-9279-0c73ab96a7bf','6dc3c118-ef78-4743-89dd-d61837e0a863','7af46d2b-921a-4b08-a967-096573568423','dac0b5dc-6589-4ae9-b979-d73b3753d2be','0da71820-12ab-4ece-8a38-825b7811bd22','d345f8ab-6ccf-4465-8130-4766d26e205c','a36ba926-3c1f-4988-bfe4-4cdcb98eb451','aa8d245c-14c3-4df6-b749-87fbba333170','d1a78352-b8cd-4944-bc68-b370405b9966','3d388a1a-5982-441c-a0a0-1d83f8f8141a','45dfa128-0280-4332-9ba0-1c6d72e71717','881a07a4-a8db-48bf-8291-300be0be0289','d8d1e9cf-0776-4364-b031-2b1d799457e6','c48b2772-4068-44a9-a3b9-0b60f9b76b34','9dddf2d9-79c5-4a5c-9d08-97d42e1c4bff','66864dac-8b01-4e87-912e-d5572ecc8257','148c22a6-5f84-4432-9e5f-43f046aac4b2','291509ed-2005-4bb3-9492-ee019034de80','bda1bcb3-a237-4d15-894e-22fdd5bc549b','120353c9-57f6-4e58-94f2-a9d35cc3f017','dcebfba0-2a48-467d-a594-ed9ba13dbca5','d6a326b3-4a2d-4f6d-b8d7-e3d09ba17efc','d35b6267-b08f-4d57-bcbd-d2635ad0bddb','4df6ff95-bb27-4d5c-80c4-f13f652bdfd6','a641f3e8-9681-43e2-9fc0-3e7645ae4d58','df289591-2b78-4c36-a914-70d751505d4b','88cbd480-7352-40a0-9129-a605ffded6f0','e5934fff-159a-4b6a-bf45-7f97af8de935','c100490d-f1bd-4365-ba2e-6eee13b39d8e','8fc50723-ec96-4451-8ed8-b62673bd0cc7','4ba1a009-1a2c-4a4a-befa-0af2a7bf0327','7f5ae93f-a533-432b-9b20-35509b8771c1','d9801f34-8f7f-40e0-bc44-ab74969a9b9f','b461f8de-7479-45bf-9276-48d871916d25') and platform in ('DB','TXYB','DOUBA','TYQW') "
)
task_list = []
for i in query_list:
task_list.append(
{
"taskId": i.get("taskId"),
"prompt": i.get("prompt"),
"platform": i.get("platform"),
"insertime": i.get("insertime"),
}
)
plat_form_map = {
"DP": "deepseek网页版",
"DB": "豆包网页版",
"TXYB": "腾讯元宝",
"TYQW": "通义千问",
"KIMI": "kimi",
"WXYY": "文心一言",
"BDAI": "百度ai",
"DYAI": "抖音ai",
"DOUBA": "豆包安卓版",
"DPA": "deepseek安卓版",
}
csv_path = "/Users/yaowentong/Desktop/欧莱雅_面霜_result.csv"
with open(csv_path, "w", newline="", encoding="utf-8-sig") as f:
writer = csv.writer(f)
# 表头
writer.writerow(["prompt", "platform", "context", "quote", "insert_time"])
for i in task_list:
taskId = i.get("taskId")
prompt = i.get("prompt")
platform = i.get("platform")
insert_time = i.get("insertime")
context_path = f"geo/{taskId}/{platform}/context.txt"
quote_path = f"geo/{taskId}/{platform}/quote.txt"
if tos_utils.check_file_in_tos(context_path):
context_text = tos_utils.get_string_from_tos(context_path)
quote_text = tos_utils.get_string_from_tos(quote_path) if tos_utils.check_file_in_tos(quote_path) else ""
writer.writerow([
prompt,
plat_form_map.get(platform, platform),
context_text,
quote_text,
timestamp_to_datetime(insert_time)
])
print(f"数据已成功写入CSV文件:{csv_path}")
def to_excel2():
    """Export the 2026-02-09 doubao snapshot results to an Excel workbook.

    Reads the distinct ``(reqId, prompt, insertime, platform)`` rows from
    ``geo_third_task`` (the time filter is hard-coded in the query). For every
    request whose ``context.txt`` exists in TOS, one row is collected:
    ``(prompt, platform label, context, think, quote, share_url,
    screenshot URL, insert time)``.

    The workbook path is hard-coded to the author's desktop.
    """
    query_list = bh_utils.query_data(
        "select distinct reqId,prompt,insertime,platform from geo_third_task where insertime >1770566400 ")

    # Platform code -> label written to the sheet.
    plat_form_map = {
        "DP": "deepseek网页版",
        "DB": "豆包网页版",
        "TXYB": "腾讯元宝",
        "TYQW": "通义千问",
        "KIMI": "kimi",
        "WXYY": "文心一言",
        "BDAI": "百度ai",
        "DYAI": "抖音ai",
        "DOUBA": "豆包安卓版",
        "DPA": "deepseek安卓版",
    }

    batch_data = []
    for task in query_list:
        req_id = task.get("reqId")
        platform = task.get("platform")
        # All artefacts of one request live under the same TOS prefix.
        base = f"geo_snipaste/geo_snipaste/20260209/doubao/{req_id}"
        meta_path = f"{base}/{req_id}.json"
        context_path = f"{base}/context.txt"
        quote_path = f"{base}/quote.txt"
        think_path = f"{base}/think.txt"
        # Requests without a stored context have nothing to export.
        if not tos_utils.check_file_in_tos(context_path):
            continue
        batch_data.append(
            (
                task.get("prompt"),
                # Fall back to the raw code for unmapped platforms, as to_csv does.
                plat_form_map.get(platform, platform),
                tos_utils.get_string_from_tos(context_path),
                tos_utils.get_string_from_tos(think_path),
                tos_utils.get_string_from_tos(quote_path),
                json.loads(tos_utils.get_string_from_tos(meta_path)).get('share_url'),
                f'https://tcdn.aidso.com/{base}/{req_id}.png',
                timestamp_to_datetime(task.get("insertime")),
            )
        )

    excel_writer = ExcelWriter("/Users/yaowentong/Desktop/美团20260209.xlsx")
    try:
        excel_writer.write_batch_rows(batch_data)
    finally:
        # Always release the writer, even when writing the rows fails.
        excel_writer.close()
    print("数据已成功写入Excel文件!")
from collections import defaultdict
def regroup_by_brand(data_list):
    """Group request records by their (stripped) brand name.

    Args:
        data_list: Iterable of dicts carrying ``brand_name``, ``req_id`` and
            ``created_at``. ``None`` is treated as an empty list, because the
            upstream API helper returns ``None`` when the response has no
            ``data`` field.

    Returns:
        A list of ``{"brand_name": str, "data": [{"req_id", "created_at"}, ...]}``
        dicts, in order of first appearance of each brand. Records with a
        missing, empty or whitespace-only brand name are dropped.
    """
    brand_map = defaultdict(list)
    for item in data_list or ():
        brand_name = (item.get("brand_name") or "").strip()
        if not brand_name:
            continue
        brand_map[brand_name].append({
            "req_id": item.get("req_id"),
            "created_at": item.get("created_at"),
        })
    return [
        {"brand_name": brand_name, "data": items}
        for brand_name, items in brand_map.items()
    ]
if __name__ == '__main__':
    # Ad-hoc manual run: only fetches the request list; the result is discarded.
    get_req_id()
# redis_client4 = init_redis4()
# list_2 = ['想喝李山山,哪里有大额券使用']
# print(redis_client4.scard("mt_third_task"))
# query_list = bh_utils.query_data(
# "select distinct reqId from geo_third_task where insertime >1770566400")
# for i in query_list:
# req_id = i.get("reqId")
# file = f"geo_snipaste/geo_snipaste/20260209/doubao/{req_id}/{req_id}.json"
# content = json.loads(tos_utils.get_string_from_tos(file)).get('content')
# doubao_process_original_data(file, content)
# to_excel2()
# list_keyword = ['淡马茶坊怎么点更便宜','奈雪的茶怎么点更便宜','东方墨兰怎么点更便宜','馬伍旺饮料厂怎么点更便宜','Blueglass酸奶怎么点更便宜','春莱怎么点更便宜','茶话弄怎么点更便宜','陈多多怎么点更便宜','树夏怎么点更便宜','李山山怎么点更便宜','GridCoffee怎么点更便宜','阿水大杯茶怎么点更便宜','绿茶餐厅怎么点更便宜','达美乐怎么点更便宜','淡马茶坊怎么薅羊毛','奈雪的茶怎么薅羊毛','东方墨兰怎么薅羊毛','馬伍旺饮料厂怎么薅羊毛','Blueglass酸奶怎么薅羊毛','春莱怎么薅羊毛','茶话弄怎么薅羊毛','陈多多怎么薅羊毛','树夏怎么薅羊毛','李山山怎么薅羊毛','GridCoffee怎么薅羊毛','阿水大杯茶怎么薅羊毛','绿茶餐厅怎么薅羊毛','达美乐怎么薅羊毛','想喝淡马茶坊,哪里有大额券使用','想喝奈雪的茶,哪里有大额券使用','想喝东方墨兰,哪里有大额券使用','想喝馬伍旺饮料厂,哪里有大额券使用','想喝Blueglass酸奶,哪里有大额券使用','想喝春莱,哪里有大额券使用','想喝茶话弄,哪里有大额券使用','想喝陈多多,哪里有大额券使用','想喝树夏,哪里有大额券使用','想喝李山山,哪里有大额券使用','想喝GridCoffee,哪里有大额券使用','想喝阿水大杯茶,哪里有大额券使用','想喝绿茶餐厅,哪里有大额券使用','想喝达美乐,哪里有大额券使用','淡马茶坊无门槛红包怎么领','奈雪的茶无门槛红包怎么领','东方墨兰无门槛红包怎么领','馬伍旺饮料厂无门槛红包怎么领','Blueglass酸奶无门槛红包怎么领','春莱无门槛红包怎么领','茶话弄无门槛红包怎么领','陈多多无门槛红包怎么领','树夏无门槛红包怎么领','李山山无门槛红包怎么领','GridCoffee无门槛红包怎么领','阿水大杯茶无门槛红包怎么领','绿茶餐厅无门槛红包怎么领','达美乐无门槛红包怎么领','淡马茶坊能领的最大面额红包是多少','奈雪的茶能领的最大面额红包是多少','东方墨兰能领的最大面额红包是多少','馬伍旺饮料厂能领的最大面额红包是多少','Blueglass酸奶能领的最大面额红包是多少','春莱能领的最大面额红包是多少','茶话弄能领的最大面额红包是多少','陈多多能领的最大面额红包是多少','树夏能领的最大面额红包是多少','李山山能领的最大面额红包是多少','GridCoffee能领的最大面额红包是多少','阿水大杯茶能领的最大面额红包是多少','绿茶餐厅能领的最大面额红包是多少','达美乐能领的最大面额红包是多少','淡马茶坊外卖如何减免配送费','奈雪的茶外卖如何减免配送费','东方墨兰外卖如何减免配送费','馬伍旺饮料厂外卖如何减免配送费','Blueglass酸奶外卖如何减免配送费','春莱外卖如何减免配送费','茶话弄外卖如何减免配送费','陈多多外卖如何减免配送费','树夏外卖如何减免配送费','李山山外卖如何减免配送费','GridCoffee外卖如何减免配送费','阿水大杯茶外卖如何减免配送费','绿茶餐厅外卖如何减免配送费','达美乐外卖如何减免配送费']
# query_list = bh_utils.query_data(
# "select * from geo_third_task where insertime >1770480000 and prompt in ('淡马茶坊怎么点更便宜','奈雪的茶怎么点更便宜','东方墨兰怎么点更便宜','馬伍旺饮料厂怎么点更便宜','Blueglass酸奶怎么点更便宜','春莱怎么点更便宜','茶话弄怎么点更便宜','陈多多怎么点更便宜','树夏怎么点更便宜','李山山怎么点更便宜','GridCoffee怎么点更便宜','阿水大杯茶怎么点更便宜','绿茶餐厅怎么点更便宜','达美乐怎么点更便宜','淡马茶坊怎么薅羊毛','奈雪的茶怎么薅羊毛','东方墨兰怎么薅羊毛','馬伍旺饮料厂怎么薅羊毛','Blueglass酸奶怎么薅羊毛','春莱怎么薅羊毛','茶话弄怎么薅羊毛','陈多多怎么薅羊毛','树夏怎么薅羊毛','李山山怎么薅羊毛','GridCoffee怎么薅羊毛','阿水大杯茶怎么薅羊毛','绿茶餐厅怎么薅羊毛','达美乐怎么薅羊毛','想喝淡马茶坊,哪里有大额券使用','想喝奈雪的茶,哪里有大额券使用','想喝东方墨兰,哪里有大额券使用','想喝馬伍旺饮料厂,哪里有大额券使用','想喝Blueglass酸奶,哪里有大额券使用','想喝春莱,哪里有大额券使用','想喝茶话弄,哪里有大额券使用','想喝陈多多,哪里有大额券使用','想喝树夏,哪里有大额券使用','想喝李山山,哪里有大额券使用','想喝GridCoffee,哪里有大额券使用','想喝阿水大杯茶,哪里有大额券使用','想喝绿茶餐厅,哪里有大额券使用','想喝达美乐,哪里有大额券使用','淡马茶坊无门槛红包怎么领','奈雪的茶无门槛红包怎么领','东方墨兰无门槛红包怎么领','馬伍旺饮料厂无门槛红包怎么领','Blueglass酸奶无门槛红包怎么领','春莱无门槛红包怎么领','茶话弄无门槛红包怎么领','陈多多无门槛红包怎么领','树夏无门槛红包怎么领','李山山无门槛红包怎么领','GridCoffee无门槛红包怎么领','阿水大杯茶无门槛红包怎么领','绿茶餐厅无门槛红包怎么领','达美乐无门槛红包怎么领','淡马茶坊能领的最大面额红包是多少','奈雪的茶能领的最大面额红包是多少','东方墨兰能领的最大面额红包是多少','馬伍旺饮料厂能领的最大面额红包是多少','Blueglass酸奶能领的最大面额红包是多少','春莱能领的最大面额红包是多少','茶话弄能领的最大面额红包是多少','陈多多能领的最大面额红包是多少','树夏能领的最大面额红包是多少','李山山能领的最大面额红包是多少','GridCoffee能领的最大面额红包是多少','阿水大杯茶能领的最大面额红包是多少','绿茶餐厅能领的最大面额红包是多少','达美乐能领的最大面额红包是多少','淡马茶坊外卖如何减免配送费','奈雪的茶外卖如何减免配送费','东方墨兰外卖如何减免配送费','馬伍旺饮料厂外卖如何减免配送费','Blueglass酸奶外卖如何减免配送费','春莱外卖如何减免配送费','茶话弄外卖如何减免配送费','陈多多外卖如何减免配送费','树夏外卖如何减免配送费','李山山外卖如何减免配送费','GridCoffee外卖如何减免配送费','阿水大杯茶外卖如何减免配送费','绿茶餐厅外卖如何减免配送费','达美乐外卖如何减免配送费')")
# result_list = []
#
# for i in query_list:
# prompt = i.get("prompt")
# result_list.append(prompt)
#
# diff_b = list(set(list_keyword) - set(result_list))
# print(diff_b)
# to_excel("result")
# data_list = get_req_id()
# list = regroup_by_brand(data_list)
# for i in list:
# print(i)
# print('----')
# print('----')
# print('----')
# to_excel()
# process_batch()
#
\ No newline at end of file
from flask import Flask
from aidso_geo.core.routes.interface import line_app
from aidso_geo.core.routes.feishu_interface import feishu_app
from aidso_geo.core.routes.third_interface import third_app
# Single Flask application serving the three route blueprints imported above.
app = Flask(__name__)
# Keep non-ASCII text (e.g. Chinese messages) unescaped in JSON responses.
app.json.ensure_ascii = False
app.register_blueprint(line_app)
app.register_blueprint(third_app)
app.register_blueprint(feishu_app)
if __name__ == '__main__':
    # Direct-run entry point: listen on all interfaces, port 8086.
    app.run(host='0.0.0.0', port=8086)
\ No newline at end of file
import requests
import json
import time
import os ,sys
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(BASE_DIR)
from aidso_geo.utils import bh_utils
def task_commit_api(task_data):
    """Submit one task to the GEO ``task_commit`` endpoint and mark it as processing.

    Only the whitelisted fields of ``task_data`` are sent. When the service
    answers with ``code == 200``, the row's ``status`` is set to
    ``'PROCESSING'`` and the row is written to ``geo_third_task_data``.

    Args:
        task_data: Task row (dict). Mutated in place on success.
    """
    url = "http://172.16.1.223:8086/api/geo/task_commit"
    fields = ["prompt", "taskId", "reqId", "platform", "type", "thinkingEnabled"]
    # Build the payload from the argument itself, not from a module-level
    # variable, so the function works for any caller.
    payload = {k: task_data.get(k) for k in fields}
    headers = {
        'Content-Type': 'application/json'
    }
    # NOTE(review): no timeout is set, so a stalled server blocks the caller.
    response = requests.request("POST", url, headers=headers, json=payload)
    try:
        response_data = response.json()
        print(response.text)
        if response_data.get("code") == 200:
            task_data["status"] = 'PROCESSING'
            bh_utils.insert_data("geo_third_task_data", [task_data])
    except Exception as e:
        # Best effort: a bad response body or a failed insert is only reported.
        print(e)
def get_result_api(task_data):
    """Poll the GEO ``task_check`` endpoint for one task and record its success.

    When the service answers with ``code == 200`` and ``data.status ==
    'success'``, the row's ``status`` is set to ``'SUCCESS'`` and the row is
    written to ``geo_third_task_data``. Errors while decoding the response or
    inserting the row are printed and not re-raised.

    Args:
        task_data: Task row (dict) holding at least ``reqId``. Mutated in
            place on success.
    """
    url = f"http://172.16.1.223:8086/api/geo/task_check?reqId={task_data.get('reqId')}"
    response = requests.request("GET", url, data={})
    try:
        body = response.json()
        finished = body.get("code") == 200 and body.get('data').get('status') == 'success'
        if finished:
            task_data["status"] = 'SUCCESS'
            bh_utils.insert_data("geo_third_task_data", [task_data])
    except Exception as err:
        print(err)
def task_commit():
    """Return the rows of ``geo_third_task_data`` whose status is ``'ING'``."""
    return bh_utils.query_data("select * from geo_third_task_data where status = 'ING' ")
def get_task():
    """Return the rows of ``geo_third_task_data`` whose status is ``'PROCESSING'``."""
    processing_sql = "select * from geo_third_task_data where status = 'PROCESSING' "
    return bh_utils.query_data(processing_sql)
if __name__ == '__main__':
    # Polling loop: every 10 seconds submit the rows that are still 'ING',
    # then check the rows that are 'PROCESSING'. Any exception escaping a
    # round is printed and ends the loop.
    try:
        while True:
            commit_data_list = task_commit()
            # NOTE(review): keep the loop variable named ``i`` unless
            # task_commit_api is confirmed not to read a module-level ``i``.
            for i in commit_data_list or ():
                task_commit_api(i)
            task_data_list = get_task()
            for i in task_data_list or ():
                get_result_api(i)
            time.sleep(10)
    except Exception as e:
        print(e)
# {"prompt": "羽绒服品牌推荐?",
# "taskId": "21fd537f-131b-469a-9d5c-6eaccc00da2c11011d61",
# "reqId": "5f4d87b5-7c9e-4095-81d2-7fb9ccccc44cff111111e1",
# "platform": "TYQWA",
# "type": "stream",
# "thinkingEnabled": "1"}
\ No newline at end of file
import json
import secrets
from datetime import datetime
from flask import request, jsonify, Flask,Blueprint
from aidso_geo.core.feishu_snipaste import init_redis4, send_message, webhook_snipaste
# Blueprint carrying the Feishu webhook route defined below.
feishu_app = Blueprint("feishu_app", __name__)
from loguru import logger
# Token compared against the ``token`` field of url_verification requests.
# NOTE(review): secret is hard-coded in source; consider loading it from configuration.
VERIFICATION_TOKEN = "E7LUEWjOuj71AZ0H2rNMEec0o0sYYCJR"
# open_id a mention must carry for a message to count as addressed to the bot.
BOT_OPEN_ID = "ou_102577da3586dabf877d4153ffe15d93"
# Redis client used to read the size of the ``mt_third_task`` set.
r = init_redis4()
from concurrent.futures import ThreadPoolExecutor
# Small pool (2 workers) that runs webhook_snipaste outside the request thread.
executor = ThreadPoolExecutor(max_workers=2)
def run_webhook_snipaste_async(prompt_list):
    """Thread-pool entry point: run ``webhook_snipaste`` and log the outcome.

    Exceptions derived from ``Exception`` are logged with their traceback and
    not re-raised.

    Args:
        prompt_list: Prompts handed to ``webhook_snipaste`` unchanged.
    """
    try:
        webhook_snipaste(prompt_list)
        logger.success(f"异步执行完成, 问题数量: {len(prompt_list)}")
    except Exception as exc:
        logger.exception(f"异步执行 webhook_snipaste 失败: {exc}")
@feishu_app.route("/feishu/webhook", methods=["POST"])
def webhook():
    """Handle Feishu callbacks: URL verification and text messages that @-mention the bot.

    * ``url_verification`` requests are answered with the challenge when the
      token matches ``VERIFICATION_TOKEN``, otherwise with HTTP 403.
    * For ``im.message.receive_v1`` text messages mentioning the bot, the
      mention placeholders are removed and the remaining text is split on
      ``|||`` into prompts. They are submitted to the thread pool unless the
      ``mt_third_task`` set is non-empty.
    * Every other request is acknowledged with ``{"code": 0}``.
    """
    data = request.get_json(force=True)

    # Handshake sent by Feishu when the callback URL is configured.
    if data.get("type") == "url_verification":
        if data.get("token") == VERIFICATION_TOKEN:
            return jsonify({"challenge": data["challenge"]})
        return jsonify({"msg": "forbidden"}), 403

    header = data.get("header") or {}
    if header.get("event_type") != "im.message.receive_v1":
        return jsonify({"code": 0})

    msg = (data.get("event") or {}).get("message", {})
    if msg.get("message_type") != "text":
        return jsonify({"code": 0})

    mentions = msg.get("mentions") or []
    is_at_bot = any(
        mention.get("mentioned_type") == "bot"
        and (mention.get("id") or {}).get("open_id") == BOT_OPEN_ID
        for mention in mentions
    )
    if not is_at_bot:
        return jsonify({"code": 0})

    # Drop the mention placeholders so that only the user's text remains.
    text = json.loads(msg.get("content") or "{}").get("text", "")
    for mention in mentions:
        placeholder = mention.get("key")
        if placeholder:
            text = text.replace(placeholder, "").strip()

    if '|||' not in text:
        send_message("----格式错误需要用 ||| 分割每一个问题-----")
    elif r.scard('mt_third_task') > 0:
        # A previous batch is still in the set; refuse to start another.
        send_message(
            "当前队列任务正在执行中,请稍后再试~")
    else:
        prompt_list = [part.strip() for part in text.split('|||') if part.strip()]
        if prompt_list:
            send_message("----开始执行-----")
            executor.submit(run_webhook_snipaste_async, prompt_list)
            logger.success(f"已提交线程池任务, 问题数量: {len(prompt_list)}")
        else:
            send_message("----格式错误 ||| 两边没有有效问题-----")
    return jsonify({"code": 0})
if __name__ == '__main__':
    # Manual check: print the current size of the ``mt_third_task`` set.
    print(r.scard('mt_third_task'))
from flask import jsonify, request,Blueprint
import os, sys
import json
import queue
from concurrent.futures import ThreadPoolExecutor
from loguru import logger
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(BASE_DIR)
from aidso_geo.models.process import process_call_back, commit_task, main_process
from aidso_geo.config.base_config import init_redis, init_redis8
from aidso_geo.utils import bh_utils, tos_utils, url_utils
# Blueprint carrying the /api/geo/* routes defined below.
line_app = Blueprint("line_app", __name__)
# Redis client used for the task_check response cache and the task_commit list.
redis_client = init_redis()
# NOTE(review): only referenced from commented-out code in the visible part of this module.
redis_client8 = init_redis8()
# Size of the background thread pool.
MAX_WORKERS = 30
# Upper bound on background tasks accepted at once (running + waiting).
MAX_PENDING_TASKS = 200
executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
# Bounded queue used as a slot counter by submit_background_task.
pending_queue = queue.Queue(maxsize=MAX_PENDING_TASKS)
def submit_background_task(func, *args, **kwargs) -> bool:
    """Run ``func(*args, **kwargs)`` on the shared thread pool if a slot is free.

    ``pending_queue`` acts as a bounded counter: one token is put per accepted
    task and removed again when the task finishes.

    Args:
        func: Callable to execute in the background.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        ``True`` when the task was accepted, ``False`` when
        ``MAX_PENDING_TASKS`` tasks are already pending.

    Raises:
        Whatever ``executor.submit`` raises (e.g. ``RuntimeError`` after the
        pool has been shut down). The reserved slot is released first.
    """
    try:
        pending_queue.put_nowait(1)
    except queue.Full:
        return False

    def _release_slot():
        # A token is guaranteed to be present because this task reserved one,
        # so the non-blocking variant never has to wait.
        pending_queue.get_nowait()
        pending_queue.task_done()

    def _run():
        try:
            func(*args, **kwargs)
        except Exception:
            # Nobody inspects the returned future, so an unlogged failure
            # would disappear without a trace.
            logger.exception(f"background task {getattr(func, '__name__', func)} failed")
            raise
        finally:
            _release_slot()

    try:
        executor.submit(_run)
    except Exception:
        # The task never started; give the slot back, otherwise it is lost.
        _release_slot()
        raise
    return True
def _cache_get_json(key: str):
    """Return the JSON-decoded value cached under ``key``, or ``None``.

    ``None`` is returned when the key is missing or empty. A value that cannot
    be decoded is deleted from Redis before ``None`` is returned.
    """
    raw = redis_client.get(key)
    if raw:
        try:
            return json.loads(raw)
        except Exception:
            # Corrupt cache entry: drop it so that it is rebuilt next time.
            redis_client.delete(key)
    return None
def _cache_set_json(key: str, obj: dict, ttl: int):
    """Store ``obj`` as JSON under ``key`` with an expiry of ``ttl`` seconds."""
    serialized = json.dumps(obj, ensure_ascii=False)
    redis_client.set(key, serialized, ex=ttl)
@line_app.route('/api/geo/task_check', methods=['GET'])
def task_check():
    """Report the state of a committed task and return its result once finished.

    Query args:
        reqId: Identifier of the request to look up (required).

    Response body (JSON):
        * ``code`` 200 with ``data.status == 'success'`` and the stored
          ``result.json`` content when the task status is ``SUCCESS``.
        * ``code`` 200 with ``data.status == 'ING'`` for every other status.
          This answer is cached for 10 seconds.
        * ``code`` 400 when ``reqId`` is missing, malformed or unknown.
        * ``code`` 500 when the query or the result download fails.

    Error messages are returned under ``msg``. The responses that
    historically used ``status`` for the message keep that key as well.
    """
    req_id = request.args.get('reqId')
    if not req_id:
        message = "参数错误:reqId 不能为空"
        return jsonify({
            "code": 400,
            "status": message,
            "msg": message,
            "data": {}
        })

    resp_cache_key = f"geo:task_check:resp:{req_id}"
    cached_resp = _cache_get_json(resp_cache_key)
    if cached_resp:
        return jsonify(cached_resp)

    # reqId is caller-controlled and is interpolated into the SQL text below.
    # Refuse values that could terminate or escape the quoted literal.
    # TODO(review): switch to a parameterized query if bh_utils supports one.
    if "'" in req_id or "\\" in req_id:
        message = "参数错误:reqId 包含非法字符"
        return jsonify({
            "code": 400,
            "status": message,
            "msg": message,
            "data": {}
        })

    try:
        data = bh_utils.query_data(f"select * from geo_commit_task where reqId ='{req_id}'")
    except Exception as e:
        message = f"查询失败:{str(e)}"
        return jsonify({
            "code": 500,
            "status": message,
            "msg": message,
            "data": {}
        })
    if not data:
        return jsonify({
            "code": 400,
            "msg": "reqId 未提交",
            "data": {}
        })

    task_info = data[0]
    status = task_info['status']
    logger.success(f"{task_info['reqId']}--{task_info['platform']}--{task_info['prompt']}---{status}")

    if status == 'SUCCESS':
        try:
            result_path = f'geo/{task_info["taskId"]}/{task_info["platform"]}/result.json'
            result_content = tos_utils.get_string_from_tos(result_path)
            result_data = json.loads(result_content)
            return jsonify({
                "code": 200,
                "msg": "success",
                "data": {
                    "status": 'success',
                    "result": result_data
                }
            })
        except Exception as e:
            message = f"获取结果失败:{str(e)}"
            return jsonify({
                "code": 500,
                "status": message,
                "msg": message,
                "data": {}
            })

    # Every other status (ING, PROCESSING, QUEUE, PROCESS_FAIL, FAIL or an
    # unknown value) is reported to the caller as still in progress.
    resp = {
        "code": 200,
        "msg": "success",
        "data": {
            "status": 'ING',
            "result": {}
        }
    }
    _cache_set_json(resp_cache_key, resp, 10)
    return jsonify(resp)
@line_app.route('/api/geo/task_commit', methods=['POST'])
def task_commit():
    """Accept a task submission.

    The JSON body must contain ``prompt``, ``taskId``, ``reqId``, ``platform``
    and ``type``.

    * ``type == 'stream'``: the task is recorded through ``commit_task`` with
      status ``'ING'`` and ``main_process`` is scheduled on the background
      pool. Success is reported only when both steps succeed.
    * any other type: the payload is pushed onto the Redis list
      ``geo:task_commit:list``, and an 'ING' answer for ``task_check`` is
      cached (600 s for ``stream_batch``, 18000 s otherwise).

    Response body (JSON): ``code`` 200 on success, 400 for an empty body or
    missing fields, 503 when the task could not be accepted, 500 on an
    unexpected error. The missing-fields message is returned under ``msg``
    and, for backward compatibility, under ``status``.
    """
    req_id = ""
    try:
        data = request.get_json()
        if not data:
            return jsonify({
                "code": 400,
                "msg": "请求数据为空,请检查请求体",
                "reqId": req_id
            })
        req_id = data.get("reqId", "")
        task_type = data.get('type', "")
        platform = data.get('platform', "")
        prompt = data.get('prompt', "")

        required_fields = ["prompt", "taskId", "reqId", "platform", "type"]
        missing_fields = [field for field in required_fields if field not in data]
        if missing_fields:
            message = f"缺少必要字段:{', '.join(missing_fields)}"
            return jsonify({
                "code": 400,
                "status": message,
                "msg": message,
                "reqId": req_id
            })

        # NOTE: an alternative path that queued every task through
        # redis_client8 is currently disabled.
        if task_type == 'stream':
            insert_ok = commit_task(data, 'ING')
            ok = submit_background_task(main_process, data)
            if insert_ok and ok:
                return jsonify({
                    "code": 200,
                    "msg": 'success',
                    "reqId": req_id
                })
        else:
            logger.success(f"{data['reqId']}--{platform}--{prompt}--任务提交--{task_type}")
            ret = redis_client.lpush("geo:task_commit:list", json.dumps(data, ensure_ascii=False))
            if ret and ret > 0:
                # Pre-seed the task_check cache so that polling clients get an
                # 'ING' answer without a database lookup.
                resp_cache_key = f"geo:task_check:resp:{req_id}"
                resp = {
                    "code": 200,
                    "msg": "success",
                    "data": {
                        "status": 'ING',
                        "result": {}
                    }
                }
                ttl = 600 if task_type == 'stream_batch' else 18000
                _cache_set_json(resp_cache_key, resp, ttl)
                return jsonify({
                    "code": 200,
                    "msg": "任务已提交",
                    "reqId": req_id
                })

        # Reached when the insert, the pool submission or the Redis push failed.
        return jsonify({
            "code": 503,
            "msg": "服务器繁忙,请稍后重试",
            "reqId": req_id
        })
    except Exception as e:
        return jsonify({
            "code": 500,
            "msg": f"服务器内部错误:{str(e)}",
            "reqId": req_id
        })
@line_app.route('/api/geo/call_back', methods=['POST'])
def data_call_back():
    """Accept a task callback and hand it to a background worker.

    The request body must be a JSON object carrying ``task_data`` (an object
    from which ``reqId`` and ``platform`` are read) and ``result``.

    Returns:
        A JSON envelope whose ``code`` field is 200 when the background task
        was accepted, 400 when the body is malformed, 503 when the background
        executor refused the task, and 500 on any unexpected exception.
    """
    req_id = ""
    try:
        data = request.get_json()
        # A missing/non-object body used to surface as an AttributeError (500);
        # report it as a client error instead.
        if not isinstance(data, dict):
            return jsonify({
                "code": 400,
                "msg": "请求数据为空,请检查请求体",
                "reqId": req_id
            })
        task_data = data.get("task_data")
        if not isinstance(task_data, dict):
            return jsonify({
                "code": 400,
                "msg": "缺少必要字段:task_data",
                "reqId": req_id
            })
        result = data.get("result")
        req_id = task_data.get('reqId')
        platform = task_data.get('platform')
        logger.success(f"{req_id}--{platform}-------CALL_BACK")
        # An earlier variant pushed the payload onto the Redis list
        # "geo:call_back:list"; it is disabled in favour of the in-process
        # background task below.
        ok = submit_background_task(process_call_back, task_data, result)
        if not ok:
            return jsonify({
                "code": 503,
                "msg": "服务器繁忙,请稍后重试",
                "reqId": req_id
            })
        return jsonify({
            "code": 200,
            "msg": "任务已提交",
            "reqId": req_id
        })
    except Exception as e:
        # Keep the traceback in the log; the response only carries str(e).
        logger.exception(f"{req_id}--data_call_back exception")
        return jsonify({
            "code": 500,
            "msg": f"服务器内部错误:{str(e)}",
            "reqId": req_id
        })
@line_app.route('/api/geo/check_quto', methods=['GET'])
def check_quto():
    """Rebuild the quote ("sources") section of a stored task result.

    Reads ``reqId`` from the query string, looks the task up in
    ``geo_commit_task``, loads ``result.json`` from TOS and, when it already
    has a truthy ``sources`` entry, re-derives a ``quto_id`` for every quote in
    ``quote.txt``, stores the quotes in ``geo_quote_result_v2`` and writes the
    refreshed quote list / result back to TOS.

    Returns:
        A JSON envelope: code 200 with the result on success, 400 for a
        missing/illegal/unknown ``reqId``, 500 on query or processing errors.
    """
    def has_sql_meta(value):
        # The queries below are built by string interpolation, so any value
        # that could terminate a quoted SQL literal must be refused.
        text = str(value)
        return "'" in text or "\\" in text

    def quote_key(entry):
        # A quote is identified by its URL when that is an http(s) string,
        # otherwise by its title.
        url = entry.get('url')
        if isinstance(url, str) and url.startswith('http'):
            return url
        return entry.get('title')

    req_id = ""
    try:
        req_id = request.args.get('reqId')
        if not req_id:
            return jsonify({
                "code": 400,
                "msg": "参数错误:reqId 不能为空",
                "data": {}
            })
        if has_sql_meta(req_id):
            return jsonify({
                "code": 400,
                "msg": "参数错误:reqId 包含非法字符",
                "data": {}
            })
        try:
            data = bh_utils.query_data(f"select * from geo_commit_task where reqId ='{req_id}'")
        except Exception as e:
            return jsonify({
                "code": 500,
                "msg": f"查询失败:{str(e)}",
                "data": {}
            })
        if not data:
            return jsonify({
                "code": 400,
                "msg": "reqId 未提交",
                "data": {}
            })
        task_info = data[0]
        taskId = task_info["taskId"]
        platform = task_info["platform"]
        quote_path = f'geo/{taskId}/{platform}/quote.txt'
        result_path = f'geo/{taskId}/{platform}/result.json'
        result_content = tos_utils.get_string_from_tos(result_path)
        result_data = json.loads(result_content)
        if result_data.get("sources"):
            # taskId / platform were stored from an earlier client request, so
            # they get the same guard before being interpolated into SQL.
            if has_sql_meta(taskId) or has_sql_meta(platform):
                return jsonify({
                    "code": 500,
                    "msg": "查询失败:任务标识包含非法字符",
                    "data": {}
                })
            quote_content = tos_utils.get_string_from_tos(quote_path)
            quote_data = json.loads(quote_content)
            url_ids_map = url_utils.generate_numeric_url_id([quote_key(c) for c in quote_data])
            for c in quote_data:
                c['quto_id'] = url_ids_map.get(quote_key(c))
            bh_utils.insert_data("geo_quote_result_v2", quote_data)
            tos_utils.put_string_to_tos(quote_path, quote_data)
            quotes = bh_utils.query_data(
                f"select site_icon as img ,platform ,quto_id as sourceId,title,url,site_name as site_name from geo_quote_result_v2 where task_id ='{taskId}' and platform = '{platform}'")
            result_data['sources'] = quotes
            tos_utils.put_string_to_tos(result_path, result_data)
        logger.success(f"{req_id}--check_quto-----success")
        return jsonify({
            "code": 200,
            "msg": "success",
            "data": {
                "status": 'success',
                "result": result_data
            }
        })
    except Exception as e:
        # Keep the traceback in the log; the response only carries str(e).
        logger.exception(f"{req_id}--check_quto exception")
        return jsonify({
            "code": 500,
            "msg": f"服务器内部错误:{str(e)}",
            "reqId": req_id
        })
@line_app.route('/api/geo/get_mention_count', methods=['POST'])
def get_mention_count():
    """Return positive/negative brand-mention counts for a list of request ids.

    The JSON body carries ``reqIds`` (a list).  For every id found in
    ``geo_brand_mention_list`` the response row holds the number of distinct
    and total comma-separated entries of ``positive_mentions`` and
    ``negative_mentions``.

    Returns:
        ``{"code": 200, "msg": "success", "data": [...]}`` on success (an empty
        list when no ids were supplied), or
        ``{"code": 400, "msg": "err", "data": []}`` for malformed input or any
        query failure.
    """
    try:
        data = request.get_json(force=True) or {}
        req_ids = data.get('reqIds', []) if isinstance(data, dict) else None
        if not isinstance(req_ids, list):
            return jsonify({"code": 400, "msg": "err", "data": []})
        # De-duplicate while keeping the caller's order.
        unique_ids = list(dict.fromkeys(str(i) for i in req_ids))
        if not unique_ids:
            # Nothing to look up; avoid sending an empty `IN ()` to the database.
            return jsonify({"code": 200, "msg": "success", "data": []})
        # The ids are interpolated into a quoted SQL literal, so anything that
        # could terminate the literal is refused.
        if any("'" in i or "\\" in i for i in unique_ids):
            return jsonify({"code": 400, "msg": "err", "data": []})
        in_sql = ",".join(f"'{i}'" for i in unique_ids)
        result = bh_utils.query_data(f"""
            WITH t AS (
            SELECT
            req_id,
            arrayFilter(x -> x != '',
            arrayMap(x -> replaceRegexpAll(x, '(^\\s+)|(\\s+$)', ''),
            splitByChar(',', ifNull(positive_mentions, '')))
            ) AS pos_arr,
            arrayFilter(x -> x != '',
            arrayMap(x -> replaceRegexpAll(x, '(^\\s+)|(\\s+$)', ''),
            splitByChar(',', ifNull(negative_mentions, '')))
            ) AS neg_arr
            FROM geo_brand_mention_list
            WHERE req_id IN ({in_sql}
            )
            ),
            pos AS (
            SELECT
            req_id,
            uniqExact(word) AS positive_distinct,
            count() AS positive_total
            FROM (
            SELECT req_id, arrayJoin(pos_arr) AS word
            FROM t
            )
            GROUP BY req_id
            ),
            neg AS (
            SELECT
            req_id,
            uniqExact(word) AS negative_distinct,
            count() AS negative_total
            FROM (
            SELECT req_id, arrayJoin(neg_arr) AS word
            FROM t
            )
            GROUP BY req_id
            )
            SELECT
            a.req_id,
            ifNull(p.positive_distinct, 0) AS positive_word_count,
            ifNull(p.positive_total, 0) AS positive_mention_count,
            ifNull(n.negative_distinct, 0) AS negative_word_count,
            ifNull(n.negative_total, 0) AS negative_mention_count
            FROM
            (SELECT DISTINCT req_id FROM t) a
            LEFT JOIN pos p ON a.req_id = p.req_id
            LEFT JOIN neg n ON a.req_id = n.req_id
            ORDER BY a.req_id;
            """)
        return jsonify({
            "code": 200,
            "msg": "success",
            "data": result or []
        })
    except Exception:
        # The response stays generic; the traceback goes to the log.
        logger.exception("get_mention_count exception")
        return jsonify({
            "code": 400,
            "msg": "err",
            "data": []
        })
import uuid
import time
import json
import secrets
from datetime import datetime
from flask import request, jsonify, Flask,Blueprint
from aidso_geo.config.base_config import init_redis8, PlatformType
from aidso_geo.utils import bh_utils, tos_utils
# Set of every value declared on the PlatformType enum; used to validate
# platform names supplied by callers.
SYSTEM_ALLOW_PLATFORM ={i.value for i in PlatformType}
# Blueprint that carries the /open/... third-party routes defined below.
third_app = Blueprint("third_app", __name__)
# Shared Redis client used for the "third_geo_daily" / "third_geo_total" usage hashes.
redis_client = init_redis8()
def ok(data=None, msg="ok"):
    """Build the success response: a JSON envelope with code 200 plus HTTP 200."""
    envelope = {"code": 200, "data": data, "msg": msg}
    return jsonify(envelope), 200
def parse_platform_costs(raw_value):
    """Parse the per-platform charge configuration.

    Accepted inputs:
      1. a dict such as ``{"DB": 10, "DP": 20}``
      2. a JSON string encoding such a dict

    Keys are stringified and stripped; entries with an empty key, a value
    that ``int()`` rejects, or a cost that is not strictly positive are
    dropped.  Any other input (``None``, blank/invalid JSON, non-dict JSON,
    other types) yields an empty dict.
    """
    candidate = raw_value
    if isinstance(candidate, str):
        text = candidate.strip()
        if not text:
            return {}
        try:
            candidate = json.loads(text)
        except Exception:
            return {}
    if not isinstance(candidate, dict):
        return {}

    costs = {}
    for key, value in candidate.items():
        name = str(key).strip()
        if name:
            try:
                amount = int(value)
            except Exception:
                continue
            if amount > 0:
                costs[name] = amount
    return costs
def calc_platform_cost(normalized_platforms, platform_costs, thinking_costs=None):
    """Compute the total charge and a per-platform breakdown for one submission.

    Args:
        normalized_platforms: iterable of dicts with a ``name`` key and an
            optional ``thinkingEnabled`` flag.
        platform_costs: mapping of platform name to base cost.
        thinking_costs: optional mapping of platform name to the extra cost
            added when ``thinkingEnabled`` stringifies to ``"1"``.

    Returns:
        ``(total_cost, cost_detail)`` where ``cost_detail`` maps each platform
        name to its ``base_cost``, ``thinking_extra_cost`` and ``total_cost``.

    Raises:
        ValueError: a platform has no configured cost, or its cost is <= 0.
    """
    extras = thinking_costs if thinking_costs else {}
    breakdown = {}
    running_total = 0
    for entry in normalized_platforms:
        name = entry["name"]
        if name not in platform_costs:
            raise ValueError(f"平台 {name} 未配置扣费积分")
        base_cost = int(platform_costs[name])
        if base_cost <= 0:
            raise ValueError(f"平台 {name} 扣费积分必须大于 0")
        wants_thinking = str(entry.get("thinkingEnabled", "0")) == "1"
        thinking_extra_cost = int(extras.get(name, 0)) if wants_thinking else 0
        platform_total = base_cost + thinking_extra_cost
        running_total += platform_total
        breakdown[name] = {
            "base_cost": base_cost,
            "thinking_extra_cost": thinking_extra_cost,
            "total_cost": platform_total,
        }
    return running_total, breakdown
def get_history_used_list(channel: str):
    """List the per-day usage recorded for ``channel``, newest day first.

    Scans the Redis hash ``third_geo_daily`` for fields made of the channel
    name immediately followed by exactly eight digits, and returns
    ``[{"date": <those 8 digits>, "used": <int>}, ...]`` sorted by date
    descending.
    """
    prefix = f"{channel}"
    skip = len(prefix)
    daily_map = redis_client.hgetall("third_geo_daily") or {}
    history = []
    for field, raw_used in daily_map.items():
        if field.startswith(prefix):
            day_str = field[skip:]
            # Only an 8-digit suffix counts as a date for this channel.
            if len(day_str) == 8 and day_str.isdigit():
                history.append({
                    "date": day_str,
                    "used": int(raw_used) if raw_used else 0
                })
    history.sort(key=lambda row: row["date"], reverse=True)
    return history
def parse_thinking_costs(raw_value):
    """Parse the extra per-platform charge applied when thinkingEnabled=1.

    Accepted inputs:
      1. a dict such as ``{"DB": 5, "DP": 8}``
      2. a JSON string encoding such a dict

    Backward compatibility: a missing or empty configuration, or a platform
    that has no entry, simply means no extra charge.  Entries with an empty
    key, a value ``int()`` rejects, or a non-positive cost are dropped, and
    any unsupported input yields an empty dict.
    """
    if raw_value is None:
        return {}

    if isinstance(raw_value, dict):
        mapping = raw_value
    elif isinstance(raw_value, str):
        stripped = raw_value.strip()
        if not stripped:
            return {}
        try:
            mapping = json.loads(stripped)
        except Exception:
            return {}
        if not isinstance(mapping, dict):
            return {}
    else:
        return {}

    parsed = {}
    for raw_name, raw_cost in mapping.items():
        platform = str(raw_name).strip()
        if not platform:
            continue
        try:
            cost = int(raw_cost)
        except Exception:
            continue
        if cost <= 0:
            continue
        parsed[platform] = cost
    return parsed
def err(code, msg, http_status=400):
    """Build an error response: a JSON envelope with ``data`` set to None plus the HTTP status."""
    envelope = {"code": code, "data": None, "msg": msg}
    return jsonify(envelope), http_status
def gen_authorization():
    """Return a fresh random token: 16 random bytes as 32 upper-case hex characters."""
    hex_token = secrets.token_hex(16)
    return hex_token.upper()
# Translates the per-platform "type" flag sent by callers ('0' / '1') into the
# task type string stored with each submitted task.
type_map ={
'0':"stream",
'1':"stream_batch"
}
def parse_allow_platforms(raw_value):
    """Normalise an allow-list of platforms into a set of non-empty names.

    Accepts a list, or a JSON string encoding a list.  Every element is
    stringified and stripped; blanks are discarded.  ``None``, blank or
    invalid JSON, JSON that is not a list, and any other type give an empty
    set.
    """
    candidates = raw_value
    if isinstance(candidates, str):
        text = candidates.strip()
        if not text:
            return set()
        try:
            candidates = json.loads(text)
        except Exception:
            return set()
    if not isinstance(candidates, list):
        return set()
    cleaned = (str(entry).strip() for entry in candidates)
    return {name for name in cleaned if name}
def get_auth_info_by_token(auth_token: str):
    """Look up the ``geo_third_token`` row that owns an Authorization token.

    Args:
        auth_token: raw value of the caller's ``Authorization`` header.

    Returns:
        The first matching row (channel, authorization, allow_platforms,
        status, daily_limit, total_limit, platform_costs, thinking_costs), or
        ``None`` when the token is empty, malformed or unknown.
    """
    if not auth_token or not isinstance(auth_token, str):
        return None
    # The token comes straight from a request header and is interpolated into
    # a quoted SQL literal below; refuse anything that could terminate that
    # literal instead of sending it to the database.
    if "'" in auth_token or "\\" in auth_token:
        return None
    sql = f"""
    select channel, authorization, allow_platforms, status, daily_limit, total_limit,platform_costs,thinking_costs
    from geo_third_token
    where authorization = '{auth_token}'
    limit 1
    """
    data = bh_utils.query_data(sql)
    if not data:
        return None
    return data[0]
def get_today_limit_info(channel: str, daily_limit: int):
    """Read today's usage for ``channel`` from the ``third_geo_daily`` Redis hash.

    Returns:
        ``(hash_field, used_count, remain_count)`` where the field is the
        channel followed by today's local date as YYYYMMDD, and the remainder
        never goes below zero.
    """
    redis_limit_key = f"{channel}{datetime.now().strftime('%Y%m%d')}"
    stored = redis_client.hget("third_geo_daily", redis_limit_key)
    used_count = int(stored) if stored else 0
    remain_count = max(daily_limit - used_count, 0)
    return redis_limit_key, used_count, remain_count
def get_total_limit_info(channel: str, total_limit: int):
    """Read the lifetime usage for ``channel`` from the ``third_geo_total`` Redis hash.

    Returns:
        ``(hash_field, used_count, remain_count)``; the field is the channel
        itself and the remainder never goes below zero.
    """
    stored = redis_client.hget("third_geo_total", channel)
    used_count = int(stored) if stored else 0
    return channel, used_count, max(total_limit - used_count, 0)
@third_app.route('/open/auth/create', methods=['POST'])
def create_auth():
    """Create (or return the existing) Authorization token for a channel.

    JSON body fields:
        channel (str, required)
        allow_platforms (non-empty list of platform names, required)
        total_limit (int > 0, required, must be >= daily_limit)
        daily_limit (int > 0, default 50)
        platform_costs (object, optional): when given, every allowed platform
            needs a cost > 0; when omitted each platform is charged 1.
        thinking_costs (object, optional): extra cost >= 0 per allowed platform.
        remark (str, optional)

    Returns:
        ``ok(...)`` with the channel's token, limits and usage. If an enabled
        row already exists for the channel it is returned with msg
        "channel already exists". ``err(400, ...)`` on validation failures and
        ``err(500, ...)`` when the insert fails.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return err(400, "invalid json body (must be an object)")

    channel = str(data.get("channel", "")).strip()
    daily_limit = data.get("daily_limit", 50)
    total_limit = data.get("total_limit")
    remark = str(data.get("remark", "")).strip()
    allow_platforms = data.get("allow_platforms")
    platform_costs = data.get("platform_costs")
    thinking_costs = data.get("thinking_costs")

    if not channel:
        return err(400, "missing required field: channel")
    # The channel is interpolated into a quoted SQL literal further down;
    # refuse anything that could terminate that literal.
    if "'" in channel or "\\" in channel:
        return err(400, "channel contains illegal characters")
    if allow_platforms in (None, "", [], {}):
        return err(400, "missing required field: allow_platforms")
    if not isinstance(allow_platforms, list) or len(allow_platforms) == 0:
        return err(400, "allow_platforms must be a non-empty list")

    normalized_platforms = []
    seen = set()
    for idx, p in enumerate(allow_platforms):
        if not isinstance(p, str):
            return err(400, f"allow_platforms[{idx}] must be string")
        p = p.strip()
        if not p:
            return err(400, f"allow_platforms[{idx}] cannot be empty")
        if p not in SYSTEM_ALLOW_PLATFORM:
            return err(400, f"allow_platforms[{idx}] '{p}' not allowed")
        if p in seen:
            return err(400, f"duplicate allow_platforms value: {p}")
        seen.add(p)
        normalized_platforms.append(p)

    normalized_platform_costs = {}
    normalized_thinking_costs = {}
    # platform_costs omitted: keep the legacy rule of charging 1 per platform.
    # platform_costs given: it must be an object and must price every platform
    # listed in allow_platforms.
    if platform_costs not in (None, "", [], {}):
        if not isinstance(platform_costs, dict):
            return err(400, "platform_costs must be an object")
        for p in normalized_platforms:
            if p not in platform_costs:
                return err(400, f"platform_costs missing platform: {p}")
            try:
                cost = int(platform_costs[p])
            except Exception:
                return err(400, f"platform_costs.{p} must be int")
            if cost <= 0:
                return err(400, f"platform_costs.{p} must be > 0")
            normalized_platform_costs[p] = cost

    if thinking_costs not in (None, "", [], {}):
        if not isinstance(thinking_costs, dict):
            return err(400, "thinking_costs must be an object")
        for p, v in thinking_costs.items():
            p = str(p).strip()
            if not p:
                return err(400, "thinking_costs platform cannot be empty")
            if p not in normalized_platforms:
                return err(400, f"thinking_costs platform '{p}' not in allow_platforms")
            try:
                cost = int(v)
            except Exception:
                return err(400, f"thinking_costs.{p} must be int")
            if cost < 0:
                return err(400, f"thinking_costs.{p} must be >= 0")
            # A zero surcharge is the same as no entry, so it is not stored.
            if cost > 0:
                normalized_thinking_costs[p] = cost

    try:
        daily_limit = int(daily_limit)
    except Exception:
        return err(400, "daily_limit must be int")
    if daily_limit <= 0:
        return err(400, "daily_limit must be > 0")
    if total_limit is None:
        return err(400, "missing required field: total_limit")
    try:
        total_limit = int(total_limit)
    except Exception:
        return err(400, "total_limit must be int")
    if total_limit <= 0:
        return err(400, "total_limit must be > 0")
    if total_limit < daily_limit:
        return err(400, "total_limit must be >= daily_limit")

    exists = bh_utils.query_data(
        f"""
        select channel, authorization, daily_limit, total_limit, allow_platforms,platform_costs,thinking_costs
        from geo_third_token
        where channel = '{channel}' and status = 1
        limit 1
        """
    )
    if exists:
        # The channel already has an enabled token: report it together with
        # its current usage instead of issuing a second one.
        row = exists[0]
        _, daily_used_count, daily_remain_count = get_today_limit_info(
            row["channel"],
            int(row["daily_limit"])
        )
        _, total_used_count, total_remain_count = get_total_limit_info(
            row["channel"],
            int(row["total_limit"])
        )
        return ok({
            "channel": row["channel"],
            "authorization": row["authorization"],
            "daily_limit": int(row["daily_limit"]),
            "daily_used_count": daily_used_count,
            "daily_remain_count": daily_remain_count,
            "total_limit": int(row["total_limit"]),
            "total_used_count": total_used_count,
            "total_remain_count": total_remain_count,
            "allow_platforms": json.loads(row["allow_platforms"]) if row.get("allow_platforms") else [],
            "platform_costs": json.loads(row["platform_costs"]) if row.get("platform_costs") else {},
            "thinking_costs": json.loads(row["thinking_costs"]) if row.get("thinking_costs") else {}
        }, msg="channel already exists")

    authorization = gen_authorization()
    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    insert_data = [{
        "channel": channel,
        "authorization": authorization,
        "allow_platforms": json.dumps(normalized_platforms, ensure_ascii=False),
        "status": 1,
        "daily_limit": daily_limit,
        "total_limit": total_limit,
        "remark": remark,
        "create_time": now_str,
        "update_time": now_str,
        "platform_costs": json.dumps(normalized_platform_costs, ensure_ascii=False),
        "thinking_costs": json.dumps(normalized_thinking_costs, ensure_ascii=False)
    }]
    if bh_utils.insert_data("geo_third_token", insert_data):
        return ok({
            "channel": channel,
            "authorization": authorization,
            "allow_platforms": normalized_platforms,
            "daily_limit": daily_limit,
            "daily_used_count": 0,
            "daily_remain_count": daily_limit,
            "total_limit": total_limit,
            "total_used_count": 0,
            "total_remain_count": total_limit,
            "platform_costs": normalized_platform_costs,
            "thinking_costs": normalized_thinking_costs
        })
    return err(500, "生成Authorization失败", 500)
@third_app.route('/open/mt/task_commit', methods=['POST'])
def mt_task_commit():
    """Submit one prompt to one or more platforms on behalf of a third-party channel.

    The caller authenticates with the ``Authorization`` header.  The JSON body
    carries ``prompt`` and ``platform``: a non-empty list of objects with
    ``name`` and optional ``thinkingEnabled`` ('0'/'1', default '0') and
    ``type`` ('0'/'1', default '1').

    The submission cost is computed from the channel's ``platform_costs`` /
    ``thinking_costs`` (or 1 per platform plus any thinking surcharge when no
    platform costs are configured) and checked against the daily and total
    quotas before one row per platform is inserted into
    ``geo_third_task_data``.  The usage counters in Redis are incremented only
    after a successful insert.

    Returns:
        ``ok({"taskId": ..., "reqIds": {platform: reqId}})`` on success,
        ``err(401, ...)`` for authentication problems, ``err(400, ...)`` for
        validation or quota failures, ``err(500, ...)`` when the insert fails.
    """
    auth_header = request.headers.get('Authorization')
    if not auth_header:
        return err(401, "缺少Authorization请求头", 401)
    auth_info = get_auth_info_by_token(auth_header)
    if not auth_info:
        return err(401, "Authorization参数错误", 401)
    if int(auth_info.get("status", 0)) != 1:
        return err(401, "Authorization已禁用", 401)

    channel = auth_info["channel"]
    daily_limit = int(auth_info.get("daily_limit", 50))
    total_limit = int(auth_info.get("total_limit", 0))
    channel_allow_platforms = parse_allow_platforms(auth_info.get("allow_platforms"))
    if not channel_allow_platforms:
        return err(401, "当前Authorization未配置允许平台", 401)

    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return err(400, "invalid json body (must be an object)")
    prompt = data.get("prompt")
    plat_form = data.get("platform")
    if not prompt:
        return err(400, "missing required field: prompt")
    if plat_form in (None, "", [], {}):
        return err(400, "missing required field: platform")
    if not isinstance(plat_form, list) or len(plat_form) == 0:
        return err(400, "platform must be a non-empty list")

    normalized = []
    seen = set()
    for idx, item in enumerate(plat_form):
        if not isinstance(item, dict):
            return err(400, f"platform[{idx}] must be an object")
        name = item.get("name")
        if not name:
            return err(400, f"platform[{idx}].name is required")
        # A list/dict name would raise TypeError on the set lookups below.
        if not isinstance(name, str):
            return err(400, f"platform[{idx}].name must be string")
        if name not in SYSTEM_ALLOW_PLATFORM:
            return err(400, f"platform[{idx}].name '{name}' not allowed")
        if name not in channel_allow_platforms:
            return err(400, f"platform[{idx}].name '{name}' is not allowed for current Authorization")
        if name in seen:
            return err(400, f"duplicate platform name: {name}")
        seen.add(name)
        thinking_enabled = str(item.get("thinkingEnabled", "0"))
        if thinking_enabled not in {"0", "1"}:
            return err(400, f"platform[{idx}].thinkingEnabled must be '0' or '1'")
        task_type = str(item.get("type", "1"))
        if task_type not in {"0", "1"}:
            return err(400, f"platform[{idx}].type must be '0' or '1'")
        normalized.append({
            "name": name,
            "thinkingEnabled": thinking_enabled,
            "type": task_type
        })

    platform_costs = parse_platform_costs(auth_info.get("platform_costs"))
    thinking_costs = parse_thinking_costs(auth_info.get("thinking_costs"))
    if platform_costs:
        try:
            cost, _ = calc_platform_cost(normalized, platform_costs, thinking_costs)
        except ValueError as e:
            return err(400, str(e))
    else:
        # Legacy pricing: 1 per platform, plus the thinking surcharge if any.
        cost = 0
        for item in normalized:
            thinking_extra_cost = 0
            if item["thinkingEnabled"] == "1":
                thinking_extra_cost = int(thinking_costs.get(item["name"], 0))
            cost += 1 + thinking_extra_cost

    daily_redis_limit_key, daily_used_count, _ = get_today_limit_info(channel, daily_limit)
    if daily_used_count + cost > daily_limit:
        return err(
            400,
            f"超过单日最大调用次数 {daily_limit}次/天,"
            f"本次提交消耗 {cost},当前已使用 {daily_used_count},剩余可用 {max(daily_limit - daily_used_count, 0)}"
        )
    total_redis_limit_key, total_used_count, _ = get_total_limit_info(channel, total_limit)
    if total_used_count + cost > total_limit:
        return err(
            400,
            f"超过渠道总最大调用次数 {total_limit}次,"
            f"本次提交消耗 {cost},当前已使用 {total_used_count},剩余可用 {max(total_limit - total_used_count, 0)}"
        )

    task_id = str(uuid.uuid4())
    req_ids = {}
    request_list = []
    now_ts = int(time.time())
    for p in normalized:
        req_id = str(uuid.uuid4())
        name = p["name"]
        request_list.append({
            "prompt": prompt,
            "taskId": task_id,
            "reqId": req_id,
            "platform": name,
            "thinkingEnabled": p["thinkingEnabled"],
            "type": type_map[p["type"]],
            "insertime": now_ts,
            "status": "ING",
            "channel": channel
        })
        req_ids[name] = req_id

    if bh_utils.insert_data("geo_third_task_data", request_list):
        # NOTE(review): the quota check above and these increments are not
        # atomic, so concurrent submissions can overshoot the limits.
        redis_client.hincrby("third_geo_daily", daily_redis_limit_key, cost)
        redis_client.hincrby("third_geo_total", total_redis_limit_key, cost)
        return ok({
            "taskId": task_id,
            "reqIds": req_ids,
        })
    return err(500, "服务器繁忙请重试", 500)
@third_app.route('/open/mt/get_result', methods=['GET'])
def mt_get_result():
    """Return the status (and, once finished, the stored output) of one request.

    The caller authenticates with the ``Authorization`` header and passes
    ``reqId`` in the query string.  Only rows of ``geo_third_task_data`` that
    belong to the caller's channel are visible.

    Returns:
        * 401 for authentication problems, 400 for a missing/illegal/unknown
          ``reqId``, 500 when the lookup fails.
        * 200 with ``status`` "ING" and an empty result while the task is
          ``ING`` or ``PROCESSING``.
        * 200 with ``status`` "SUCCESS" and one ``{name: content}`` entry per
          stored output file once the task succeeded.
        * 200 with the stored status and an empty result for any other status.
    """
    auth_header = request.headers.get('Authorization')
    if not auth_header:
        return err(401, "缺少Authorization请求头", 401)
    auth_info = get_auth_info_by_token(auth_header)
    if not auth_info:
        return err(401, "Authorization参数错误", 401)
    if int(auth_info.get("status", 0)) != 1:
        return err(401, "Authorization已禁用", 401)

    channel = auth_info["channel"]
    total_limit = int(auth_info.get("total_limit", 0))
    _, total_used_count, total_remain_count = get_total_limit_info(channel, total_limit)

    req_id = request.args.get('reqId')
    if not req_id:
        return err(400, "missing required param: reqId")
    # reqId is interpolated into a quoted SQL literal; refuse anything that
    # could terminate that literal.
    if "'" in req_id or "\\" in req_id:
        return err(400, "invalid param: reqId")
    try:
        data = bh_utils.query_data(
            f"select * from geo_third_task_data where reqId = '{req_id}' and channel = '{channel}' limit 1"
        )
    except Exception as e:
        return jsonify({
            "code": 500,
            "data": {},
            "msg": f"查询失败:{str(e)}"
        }), 500
    if not data:
        return jsonify({
            "code": 400,
            "msg": "reqId 未提交",
            "data": {
                "total_limit": total_limit,
                "total_used_count": total_used_count,
                "total_remain_count": total_remain_count
            }
        }), 400

    task_info = data[0]
    status = task_info['status']
    prompt = task_info['prompt']
    task_id = task_info['taskId']
    platform = task_info['platform']

    if status in ('ING', 'PROCESSING'):
        return jsonify({
            "code": 200,
            "msg": "success",
            "data": {
                "prompt": prompt,
                "status": "ING",
                "result": []
            }
        }), 200

    if status == 'SUCCESS':
        file_list = {
            "search_word": "search_keyword.txt",
            "quote": "quote.txt",
            "think": "think.txt",
            "context": "context.txt",
            "suggestions": "suggestions.txt",
            "rich_media_block": "rich_media_block.txt"
        }
        result = []
        for k, v in file_list.items():
            content = tos_utils.get_string_from_tos(f"geo/{task_id}/{platform}/{v}")
            result.append({f"{k}": content if content else ""})
        return jsonify({
            "code": 200,
            "msg": "success",
            "data": {
                "status": "SUCCESS",
                "prompt": prompt,
                "result": result
            }
        }), 200

    # Any other stored status used to fall off the end of the view and return
    # None, which Flask rejects; report the status with an empty result.
    return jsonify({
        "code": 200,
        "msg": "success",
        "data": {
            "prompt": prompt,
            "status": status,
            "result": []
        }
    }), 200
@third_app.route('/open/mt/get_usage', methods=['GET'])
def mt_get_usage():
    """Report how much of its quota the calling channel has used.

    The caller authenticates with the ``Authorization`` header.

    Returns:
        ``ok({"today_used": ..., "total_used": ...})`` or ``err(401, ...)``
        for authentication problems.
    """
    auth_header = request.headers.get('Authorization')
    if not auth_header:
        return err(401, "缺少Authorization请求头", 401)
    auth_info = get_auth_info_by_token(auth_header)
    if not auth_info:
        return err(401, "Authorization参数错误", 401)
    if int(auth_info.get("status", 0)) != 1:
        return err(401, "Authorization已禁用", 401)

    channel = auth_info["channel"]
    daily_limit = int(auth_info.get("daily_limit", 50))
    total_limit = int(auth_info.get("total_limit", 0))
    _, today_used, _ = get_today_limit_info(channel, daily_limit)
    _, total_used, _ = get_total_limit_info(channel, total_limit)
    # The per-day history is not part of the response at the moment, so the
    # full scan of the daily hash is skipped.  To expose it again, add
    # "history_used_list": get_history_used_list(channel) to the payload.
    return ok({
        "today_used": today_used,
        "total_used": total_used
    })
import json
import time
import os
import sys
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(BASE_DIR)
from aidso_geo.models.process import process_success, platform_process, scheduler, task_send_queue
from aidso_geo.utils.bh_utils import query_data
from loguru import logger
def safe_load(v):
    """Decode ``v`` as JSON when possible, otherwise hand it back unchanged.

    Falsy values and values that are already a dict or list are returned
    as-is; anything ``json.loads`` rejects is returned unchanged as well.
    """
    if not v or isinstance(v, (dict, list)):
        return v
    try:
        decoded = json.loads(v)
    except Exception:
        return v
    return decoded
def preprocess_item(item):
    """Deserialize, in place, the task fields that may hold JSON text.

    Each of the listed fields is passed through ``safe_load`` when it has a
    truthy value; the same (mutated) ``item`` is returned.
    """
    for field in ("comWordsMap", "brandWords", "comWords", "keywords", "productWordsMap"):
        if item.get(field):
            item[field] = safe_load(item.get(field))
    return item
def log_task(item):
    """Write one success-level log line identifying the task being handled."""
    req_id = item.get('reqId')
    platform = item.get('platform')
    prompt = item.get('prompt')
    task_type = item.get('type')
    logger.success(f"{req_id}---{platform}---{prompt}---{task_type}")
def run_parallel_task(data_list, handler, max_workers=5):
    """Run ``handler(item)`` for every item of ``data_list`` on a thread pool.

    Returns ``None`` after every task has finished.  An exception raised by a
    handler is logged with the item's ``taskId`` / ``reqId`` and does not stop
    the remaining tasks.  An empty ``data_list`` is a no-op.
    """
    if not data_list:
        return
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for entry in data_list:
            pending[pool.submit(handler, entry)] = entry
        for finished in as_completed(pending):
            item = pending[finished]
            try:
                finished.result()
            except Exception as e:
                logger.exception(
                    f"任务执行失败: taskId={item.get('taskId')}, reqId={item.get('reqId')}, error={e}"
                )
def processing_worker():
    """Thread 1: handle tasks whose status is PROCESSING.

    Every 180 seconds, selects the PROCESSING rows of ``geo_commit_task``
    inserted within the last 36000 seconds, preprocesses and logs each one,
    then runs ``platform_process`` over them with up to 10 threads.  Any
    exception in one round is logged and the loop carries on.
    """
    sql = (
        "select * from geo_commit_task "
        "where status = 'PROCESSING' "
        "and insertime > unix_timestamp() - 36000"
    )
    while True:
        try:
            rows = query_data(sql)
            if rows:
                prepared = []
                for row in rows:
                    ready = preprocess_item(row)
                    log_task(ready)
                    prepared.append(ready)
                run_parallel_task(prepared, platform_process, max_workers=10)
        except Exception as e:
            logger.exception(f"processing_worker 主循环异常: {e}")
        time.sleep(180)
def stream_worker():
    """Thread 2: handle tasks whose type is ``stream``.

    Every 180 seconds, selects the ``stream`` rows of ``geo_commit_task`` that
    are not SUCCESS and were inserted more than 480 seconds ago (newest
    first), preprocesses and logs each one, then runs ``process_success``
    over them with up to 3 threads.  Any exception in one round is logged and
    the loop carries on.
    """
    sql = (
        "select * from geo_commit_task "
        "where type = 'stream' "
        "and status != 'SUCCESS' "
        "and unix_timestamp() - insertime > 480 "
        "order by insertime desc"
    )
    while True:
        try:
            rows = query_data(sql)
            if rows:
                prepared = []
                for row in rows:
                    ready = preprocess_item(row)
                    log_task(ready)
                    prepared.append(ready)
                run_parallel_task(prepared, process_success, max_workers=3)
        except Exception as e:
            logger.exception(f"stream_worker 主循环异常: {e}")
        time.sleep(180)
def stream_batch_worker():
    """Thread 3: handle tasks whose type is ``stream_batch``.

    Every 180 seconds, selects the ``stream_batch`` rows of
    ``geo_commit_task`` that are not SUCCESS and were inserted more than
    18000 seconds ago (newest first), preprocesses and logs each one, then
    runs ``process_success`` over them with up to 3 threads.  Any exception
    in one round is logged and the loop carries on.
    """
    sql = (
        "select * from geo_commit_task "
        "where type = 'stream_batch' "
        "and status != 'SUCCESS' "
        "and unix_timestamp() - insertime > 18000 "
        "order by insertime desc"
    )
    while True:
        try:
            rows = query_data(sql)
            if rows:
                prepared = []
                for row in rows:
                    ready = preprocess_item(row)
                    log_task(ready)
                    prepared.append(ready)
                run_parallel_task(prepared, process_success, max_workers=3)
        except Exception as e:
            logger.exception(f"stream_batch_worker 主循环异常: {e}")
        time.sleep(180)
def batch_worker():
    """Handle tasks whose type is ``batch``.

    Every 180 seconds, selects the ``batch`` rows of ``geo_commit_task`` that
    are not SUCCESS and were inserted more than 60000 seconds ago (newest
    first), preprocesses and logs each one, then runs ``process_success``
    over them with up to 10 threads.  Any exception in one round is logged
    and the loop carries on.
    """
    sql = (
        "select * from geo_commit_task "
        "where type = 'batch' "
        "and status != 'SUCCESS' "
        "and unix_timestamp() - insertime > 60000 "
        "order by insertime desc"
    )
    while True:
        try:
            rows = query_data(sql)
            if rows:
                prepared = []
                for row in rows:
                    ready = preprocess_item(row)
                    log_task(ready)
                    prepared.append(ready)
                run_parallel_task(prepared, process_success, max_workers=10)
        except Exception as e:
            logger.exception(f"batch_worker 主循环异常: {e}")
        time.sleep(180)
def main():
    """Start the four worker threads and watch over them forever.

    Each worker runs in a daemon thread named after its function.  Once a
    minute, every thread that is no longer alive is reported at error level;
    dead threads are not restarted.
    """
    workers = (processing_worker, stream_worker, stream_batch_worker, batch_worker)
    thread_list = [
        threading.Thread(target=worker, name=worker.__name__, daemon=True)
        for worker in workers
    ]
    for thread in thread_list:
        thread.start()
        logger.info(f"线程启动成功: {thread.name}")
    while True:
        dead = [thread for thread in thread_list if not thread.is_alive()]
        for thread in dead:
            logger.error(f"线程已退出: {thread.name}")
        time.sleep(60)
# Start the worker threads only when this module is executed as a script.
if __name__ == "__main__":
    main()
import json
import time
import threading
import signal
from loguru import logger
from concurrent.futures import ThreadPoolExecutor, as_completed
from aidso_geo.config.base_config import init_redis8
from aidso_geo.models.process import main_process, process_call_back
from aidso_geo.utils import bh_utils
# Redis client used by run_callback_task to pop queued callback payloads.
redis_client8 = init_redis8()
# Loop flag polled by the task loops below; stop_handler sets it to False.
RUNNING = True
def stop_handler(signum, frame):
    """Signal-handler-shaped callback that asks the polling loops to stop.

    Logs the shutdown request and clears the module-level ``RUNNING`` flag;
    both arguments are ignored.
    """
    global RUNNING
    logger.success("开始退出")
    # The loops re-check this flag at the top of every iteration.
    RUNNING = False
def handle_item(data):
    """Decode the JSON-encoded fields of one task row, then run ``main_process`` on it.

    Each listed field is replaced in place by its ``json.loads`` result when
    it holds a truthy value; a value that is not valid JSON text makes
    ``json.loads`` raise.  Returns whatever ``main_process`` returns.
    """
    for field in ('comWordsMap', 'brandWords', 'comWords', 'keywords', 'productWordsMap'):
        encoded = data.get(field)
        if encoded:
            data[field] = json.loads(encoded)
    return main_process(data)
def handle_one_task(task):
    """Process one queued callback payload, never letting an exception escape.

    Args:
        task: a dict carrying ``task_data`` and ``result``; both are passed
            to ``process_call_back``.

    Failures are logged (with traceback) through the module logger rather
    than printed, so they end up wherever the logger is configured to write.
    """
    try:
        task_data = task.get("task_data")
        result = task.get("result")
        process_call_back(task_data, result)
    except Exception as e:
        logger.exception(f"回调任务处理失败: {e}, task={task}")
def run_stream_task():
    """Poll for pending ``stream`` tasks and process them until RUNNING is cleared.

    Each round selects up to 20 rows of ``geo_commit_task`` with type
    ``stream`` and status ``ING`` and runs ``handle_item`` on them with up to
    20 threads.  When nothing is pending, or the query itself fails, the loop
    waits 5 seconds before trying again so a transient database error no
    longer terminates the thread.
    """
    global RUNNING
    status = 'ING'
    sql = f"select * from geo_commit_task where type = 'stream' and status = '{status}' limit 20"
    while RUNNING:
        try:
            stream_task = bh_utils.query_data(sql)
        except Exception as e:
            logger.exception(f"查询任务失败: {e}")
            time.sleep(5)
            continue
        if not stream_task:
            time.sleep(5)
            continue
        with ThreadPoolExecutor(max_workers=20) as executor:
            futures = [executor.submit(handle_item, item) for item in stream_task]
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    # One failing task must not stop the rest of the batch.
                    logger.exception(f"任务执行失败: {e}")
def run_callback_task():
    """Drain queued callback payloads from Redis until RUNNING is cleared.

    Each round pops up to 20 entries from the Redis list
    ``geo:call_back:list`` and runs ``handle_one_task`` on them with up to 20
    threads; when the list is empty the loop sleeps for one second.  An entry
    that is not valid JSON is logged and skipped, so it can no longer abort
    the loop and discard the entries already popped in the same round.
    """
    global RUNNING
    REDIS_KEY = "geo:call_back:list"
    while RUNNING:
        batch = []
        # Pop at most 20 entries per round.
        for _ in range(20):
            result = redis_client8.rpop(REDIS_KEY)
            if not result:
                break
            try:
                batch.append(json.loads(result))
            except (TypeError, ValueError) as e:
                logger.error(f"回调任务解析失败: {e}, payload={result}")
        # Nothing to do: sleep instead of spinning on an empty list.
        if not batch:
            time.sleep(1)
            continue
        # Process the batch with up to 20 concurrent workers.
        with ThreadPoolExecutor(max_workers=20) as executor:
            futures = [executor.submit(handle_one_task, task) for task in batch]
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    logger.exception(f"线程执行异常: {e}")
# Script entry point: start only the stream-task polling thread and wait for it.
# The signal handlers and the callback thread are currently disabled (kept
# below as commented-out code).
if __name__ == '__main__':
    # signal.signal(signal.SIGINT, stop_handler)
    # signal.signal(signal.SIGTERM, stop_handler)
    t1 = threading.Thread(target=run_stream_task, name="stream-thread")
    # t2 = threading.Thread(target=run_callback_task, name="callback-thread")
    t1.start()
    # t2.start()
    # logger.success("stream-thread 线程已启动")
    # logger.success("callback-thread 线程已启动")
    t1.join()
    # t2.join()
    # logger.success("所有线程已退出")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
from datetime import datetime
from aidso_geo.config.base_config import init_redis8
from aidso_geo.utils import bh_utils
redis_client = init_redis8()
def safe_int(value, default=0):
    """Convert ``value`` with ``int()``, returning ``default`` for ``None`` or on any failure."""
    if value is None:
        return default
    try:
        converted = int(value)
    except Exception:
        return default
    return converted
def get_active_channels():
    """Fetch every enabled (status = 1) third-party channel with its limits.

    Returns the rows of ``geo_third_token`` as delivered by
    ``bh_utils.query_data``, or an empty list when the query yields nothing.
    """
    sql = (
        "\n"
        "select channel, daily_limit, total_limit\n"
        "from geo_third_token\n"
        "where status = 1\n"
    )
    rows = bh_utils.query_data(sql)
    return rows if rows else []
def sync_third_usage_once():
    """Write one usage snapshot row per enabled third-party channel.

    Reads:
        1. Redis hash ``third_geo_daily`` (field = channel + YYYYMMDD)
        2. Redis hash ``third_geo_total`` (field = channel)
    Writes:
        table ``geo_third_usage_snapshot``

    The original note describes this as an hourly sync; the scheduling itself
    is not part of this function.  Every failure is logged and swallowed, so
    the function always returns ``None``.
    """
    # This module's own import header does not bring `logger` into scope, so
    # every log call below would fail with NameError when the file is run on
    # its own; import it locally to keep the function self-contained.
    from loguru import logger

    now = datetime.now()
    today_pt = now.strftime("%Y%m%d")
    sync_time = now.strftime("%Y-%m-%d %H:%M:%S")
    insertime = int(time.time())
    try:
        channels = get_active_channels()
        if not channels:
            logger.info(f"[{sync_time}] sync_third_usage_once 无启用中的 channel")
            return
        # Fetch both hashes once instead of issuing one HGET per channel.
        daily_map = redis_client.hgetall("third_geo_daily") or {}
        total_map = redis_client.hgetall("third_geo_total") or {}
        insert_rows = []
        for item in channels:
            channel = str(item.get("channel", "")).strip()
            if not channel:
                continue
            daily_limit = safe_int(item.get("daily_limit"), 0)
            total_limit = safe_int(item.get("total_limit"), 0)
            daily_key = f"{channel}{today_pt}"
            today_used = safe_int(daily_map.get(daily_key), 0)
            total_used = safe_int(total_map.get(channel), 0)
            insert_rows.append({
                "channel": channel,
                "pt": today_pt,
                "daily_limit": daily_limit,
                "total_limit": total_limit,
                "today_used": today_used,
                "total_used": total_used,
                "today_remain": max(daily_limit - today_used, 0),
                "total_remain": max(total_limit - total_used, 0),
                "sync_time": sync_time,
                "insertime": insertime
            })
        if not insert_rows:
            logger.info(f"[{sync_time}] sync_third_usage_once 无可写入数据")
            return
        ok = bh_utils.insert_data("geo_third_usage_snapshot", insert_rows)
        if ok:
            logger.info(f"[{sync_time}] sync_third_usage_once 同步成功 rows={len(insert_rows)}")
        else:
            logger.error(f"[{sync_time}] sync_third_usage_once 同步失败 rows={len(insert_rows)}")
    except Exception as e:
        logger.exception(f"[{sync_time}] sync_third_usage_once 执行异常: {e}")
# Run a single usage sync when this module is executed as a script.
if __name__ == '__main__':
    sync_third_usage_once()
\ No newline at end of file
import json
from aidso_geo.models import spider_save_tos
from aidso_geo.utils import robot_utils
from aidso_geo.utils.ai_interface import get_parse_sse_result
from aidso_geo.utils.tos_utils import get_string_from_tos
def thinking_enabled_fragments(json_data):
    """Extract the search queries carried by a list of fragments.

    Args:
        json_data: iterable of dicts; a fragment may hold ``queries``, a list
            of dicts that each have a ``query`` key.

    Returns:
        The ``query`` values of the last fragment that has a ``queries``
        entry, or an empty list when no fragment has one.

    NOTE(review): each fragment overwrites the result of the previous one, so
    only the last set of queries survives.  That is the existing behaviour
    and is kept here; confirm whether accumulating across fragments was
    intended.
    """
    search_word = []
    for fragment in json_data:
        queries = fragment.get('queries')
        # A fragment without queries used to raise TypeError when iterating
        # None; skip it instead.
        if queries is None:
            continue
        search_word = [entry['query'] for entry in queries]
    return search_word
def baiduai_process_original_data(file_path):
    """Parse a Baidu AI SSE capture from TOS and persist the extracted fields.

    The capture is read line by line. Every ``data:`` line is decoded as JSON
    and dispatched on ``data.message.metaData.speedInfo.stage``:

    * ``SPEED_THINKING_STAGE``: ``reasoningContent`` strings are appended to
      the thinking text and ``referenceList`` entries to the reference list.
    * ``SPEED_CONTENT_STAGE``: ``value`` strings are appended to the answer
      and ``items`` are collected as rich media.
    * ``SPEED_GUIDE_STAGE``: ``recommendWord`` entries whose ``modelType`` is
      0 become suggestions.

    If anything raises while parsing or saving, the result of
    ``get_parse_sse_result`` is used instead. When that yields no content
    either, a failure marker is stored and ``robot_utils.feishu_tobot`` is
    called with the file path.

    Args:
        file_path: TOS object key. The fallback path reads the task id from
            segment 1 and the platform from segment 2 of the ``/``-split key.

    Returns:
        tuple: ``(file_path, search_keyword, url_list, think_content,
        response_content, suggestions)``.
    """
    url_list = []
    rich_media_block = []
    think_content = ""
    response_content = ""
    search_keyword = []
    suggestions = []
    try:
        original_content = get_string_from_tos(file_path)
        for line in original_content.split("\n"):
            # Only "data:" lines carry a payload; "event:" and blank lines
            # are ignored.
            if not line.startswith("data:"):
                continue
            try:
                # Slice the prefix off rather than splitting on "data:":
                # splitting truncated every payload that itself contained
                # the text "data:", and the event was then dropped.
                json_data = json.loads(line[len("data:"):])
            except json.JSONDecodeError:
                continue  # skip malformed payloads

            data = json_data.get("data")
            if not data:
                continue

            # The chained lookups below are intentionally unguarded: an
            # unexpected event shape raises and triggers the fallback parser.
            message = data.get('message')
            stage_data = message.get('metaData').get('speedInfo').get('stage')

            if stage_data == 'SPEED_THINKING_STAGE':
                gen_data = message.get('content').get('generator').get('data')
                if gen_data:
                    reasoning = gen_data.get('reasoningContent')
                    if isinstance(reasoning, str):
                        think_content += reasoning
                    references = gen_data.get('referenceList')
                    if references:
                        url_list += references
            elif stage_data == 'SPEED_CONTENT_STAGE':
                gen_data = message.get('content').get('generator').get('data')
                if gen_data:
                    value = gen_data.get('value')
                    items = gen_data.get('items')
                    if value and isinstance(value, str):
                        response_content += value
                    if items and isinstance(items, list):
                        rich_media_block.extend(items)
                    # Media arrived before any answer text: store a
                    # placeholder answer.
                    if items and response_content == "":
                        response_content += "生成了图片"
            elif stage_data == 'SPEED_GUIDE_STAGE':
                gen_data = message.get('content').get('generator').get('data')
                if gen_data:
                    for word in gen_data.get('recommendWord') or []:
                        if word.get('modelType') == 0:
                            suggestions.append(word.get('text'))

            # A finished generator flagged with antiFlag appends a fixed
            # refusal sentence to the answer.
            content = message.get('content')
            if content:
                generator = content.get('generator')
                if generator and generator.get('antiFlag') and generator.get('isFinished') == True:
                    response_content += '我们换个话题聊聊吧'

        # The collected rich media is now passed on the success path too;
        # previously only the failure path forwarded it.
        spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content,
                                               response_content, suggestions, rich_media_block)
        return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
    except Exception:
        # Fallback: use the externally parsed result for this task.
        parts = file_path.split('/')
        platform = parts[2]
        task_id = parts[1]
        context, quote, suggestion, think, search_word = get_parse_sse_result(platform, task_id)
        if context:
            response_content = context
            url_list = quote
            suggestions = suggestion
            think_content = think
            search_keyword = search_word
            spider_save_tos.process_and_save_files_ai(file_path, search_keyword, url_list, think_content,
                                                      response_content, suggestions)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
        else:
            response_content = "对话信息获取失败:-200"
            robot_utils.feishu_tobot(file_path)
            spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content,
                                                   response_content, suggestions, rich_media_block)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
# Manual run: parse one stored Baidu AI capture.
if __name__ == '__main__':
    file_path = "geo/21430906-2656-4ba4-b884-5f937745734c/BDAI/original.text"
    baiduai_process_original_data(file_path)
    # "Man driving a car hits 4 e-bikes, injuring 3" (presumably the prompt behind this capture)
import json
from aidso_geo.models import spider_save_tos
from aidso_geo.utils import robot_utils
from aidso_geo.utils.ai_interface import get_parse_sse_result
from aidso_geo.utils.tos_utils import get_string_from_tos
def deepseek_android_process_original_data(file_path):
    """Parse a DeepSeek (Android) SSE capture from TOS and persist the extracted fields.

    The capture is split on blank lines. Each chunk that starts with
    ``data: `` is decoded as JSON carrying a patch path ``p`` and a value
    ``v``. A dict-valued ``v`` is read as the initial snapshot
    (``v.response``): it decides whether thinking mode is on and may already
    carry a first fragment. Later events are routed by ``p`` and fragment
    ``type`` to the thinking text, the answer text, the search keywords or the
    reference list. Bare string values are appended to whichever of
    thinking/answer is currently open.

    If anything raises while parsing or saving, the result of
    ``get_parse_sse_result`` is used instead. When that yields no content
    either, a failure marker is stored and ``robot_utils.feishu_tobot`` is
    called with the file path.

    Args:
        file_path: TOS object key. The fallback path reads the task id from
            segment 1 and the platform from segment 2 of the ``/``-split key.

    Returns:
        tuple: ``(file_path, search_keyword, url_list, think_content,
        response_content, suggestions)``.
    """
    url_list = ""
    think_content = ""
    response_content = ""
    search_keyword = []
    is_think = False      # thinking mode announced by the initial snapshot
    think_bool = False    # string deltas currently belong to the thinking text
    suggestions = []
    response_bool = False  # string deltas currently belong to the answer text
    try:
        original_content = get_string_from_tos(file_path)
        # Events are separated by blank lines; only chunks that start with
        # "data: " are processed ("event: " chunks are ignored).
        for item in original_content.split("\n\n"):
            if not item.startswith("data: "):
                continue
            try:
                # Take the first line after the prefix. Splitting on "data: "
                # truncated every payload whose text contained "data: ", and
                # that event was then silently dropped.
                json_data = json.loads(item[len("data: "):].split("\n", 1)[0])
            except json.JSONDecodeError:
                continue
            path = json_data.get('p')
            value = json_data.get('v')

            # Initial snapshot.
            if isinstance(value, dict):
                response = value.get('response')
                if response.get('thinking_enabled') == True:
                    is_think = True
                fragments = response.get('fragments')
                if fragments and isinstance(fragments, list):
                    first = fragments[0]
                    if first.get('type') == 'RESPONSE':
                        response_content += first.get('content')
                    if first.get('type') == 'THINK':
                        think_content += first.get('content')
                        think_bool = True
                    if first.get('type') == 'SEARCH':
                        if isinstance(first.get('queries'), list):
                            for query in first.get('queries'):
                                search_keyword.append(query.get('query'))

            if is_think:
                # Search keywords from appended fragments.
                if path == 'response/fragments' and value[0].get('type') == 'SEARCH':
                    for query in value[0].get('queries'):
                        search_keyword.append(query.get('query'))
                if path == 'response' and value[0].get('p') == 'fragments':
                    if isinstance(value[0].get('v'), list):
                        for fragment in value[0].get('v'):
                            if fragment.get('type') == 'TOOL_SEARCH':
                                if isinstance(fragment.get('queries'), list):
                                    for query in fragment.get('queries'):
                                        search_keyword.append(query.get('query'))
                # Reference list.
                if path == 'response/search_results' or path == 'response/fragments/-1/results':
                    url_list = value
                # Batched "response" patches.
                if path == 'response':
                    if value[0].get('v') == 'FINISHED':
                        continue
                    if isinstance(value[0].get('v'), list):
                        if value[0].get('v')[0].get('type') == 'THINK':
                            think_content += value[0].get('v')[0].get('content')
                            think_bool = True
                            continue
                # Thinking section opens / closes.
                if path == 'response/fragments' and value[0].get('type') == 'THINK':
                    think_content += value[0].get('content')
                    think_bool = True
                    continue
                if path == 'response/fragments/-1/elapsed_secs':
                    think_bool = False
                    continue
                if think_bool:
                    if isinstance(value, str):
                        think_content += value
                # Answer section opens / closes.
                if path == 'response/fragments' and value[0].get('type') == 'RESPONSE':
                    response_content += value[0].get('content')
                    response_bool = True
                    continue
                if path == 'response/fragments' and value[0].get('type') == 'TIP':
                    response_bool = False
                    continue
                if path == 'response':
                    response_bool = False
                    continue
                if response_bool:
                    if isinstance(value, str):
                        response_content += value
            else:
                if path == 'response/fragments/-1/content':
                    if isinstance(value, str):
                        response_content += value
                        response_bool = True
                        continue
                if path == 'response/fragments/-1/results':
                    url_list = value
                if path == 'response/fragments' and value[0].get('type') == 'SEARCH':
                    for query in value[0].get('queries'):
                        search_keyword.append(query.get('query'))
                if path == 'response':
                    for entry in value:
                        if entry.get('p') == 'fragments':
                            if entry.get('v'):
                                for fragment in entry.get('v'):
                                    if fragment.get('queries'):
                                        for query in fragment.get('queries'):
                                            search_keyword.append(query.get('query'))
                if path == 'response/fragments' and value[0].get('type') == 'RESPONSE':
                    response_content += value[0].get('content')
                    response_bool = True
                    continue
                if path == 'response/fragments':
                    if value[0].get('v'):
                        if value[0].get('v')[0].get('type') == 'RESPONSE':
                            response_content += value[0].get('v')[0].get('content')
                            response_bool = True
                            continue
                if path == 'response':
                    if value[0].get('v') == 'FINISHED':
                        response_bool = False
                        continue
                    if isinstance(value[0].get('v'), list):
                        if value[0].get('v')[0].get('type') == 'RESPONSE':
                            response_content += value[0].get('v')[0].get('content')
                            response_bool = True
                            continue
                if path == 'response/fragments' and value[0].get('type') == 'TIP':
                    response_bool = False
                    continue
                if path == 'response/status' and value == 'FINISHED':
                    response_bool = False
                    continue
                if response_bool:
                    if isinstance(value, str):
                        response_content += value

        spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content,
                                               response_content, suggestions)
        return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
    except Exception:
        # Fallback: use the externally parsed result for this task.
        parts = file_path.split('/')
        platform = parts[2]
        task_id = parts[1]
        context, quote, suggestion, think, search_word = get_parse_sse_result(platform, task_id)
        if context:
            response_content = context
            url_list = quote
            suggestions = suggestion
            think_content = think
            search_keyword = search_word
            spider_save_tos.process_and_save_files_ai(file_path, search_keyword, url_list, think_content,
                                                      response_content, suggestions)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
        else:
            response_content = "对话信息获取失败:-200"
            robot_utils.feishu_tobot(file_path)
            spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content,
                                                   response_content, suggestions)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
# Manual run: parse one stored DeepSeek Android capture (alternative sample keys are kept commented out).
if __name__ == '__main__':
    file_path1 = 'geo/b1480658-f379-4ca7-8897-51ffef64b2d0/DPA/original.text'
    file_path2 = 'geo/799bfc72-60dc-4333-b551-3e1d284c7570/DPA/original.text'
    # file_path1 = "geo/0f59c196-aff1-46a5-aa50-5c792ebf481e/DPA/original.text"
    # file_path = "geo/0c0abba3-9122244-wwwsss4e2dddd1ssssaaass-96aaaa2wwww1-10086/DPA/original.text"
    # file_path2 = "geo/0a715fc6-ecc8-4e48-baf1-abc0c9477c1e/DPA/original.text"
    deepseek_android_process_original_data(file_path2)
import json
import traceback
from aidso_geo.models import spider_save_tos
from aidso_geo.utils import robot_utils
from aidso_geo.utils.ai_interface import get_parse_sse_result
from aidso_geo.utils.tos_utils import get_string_from_tos, put_string_to_tos
def thinking_enabled_fragments(json_data):
    """Return the search queries of the last fragment that carries ``queries``.

    Args:
        json_data: Iterable of fragment dicts. A fragment may hold a
            ``queries`` list whose entries are dicts with a ``query`` key.

    Returns:
        list: The ``query`` values of the last fragment that has a
        ``queries`` entry, or ``[]`` when no fragment has one.

    Raises:
        KeyError: If a query entry has no ``query`` key.
    """
    search_word = []
    for fragment in json_data:
        queries = fragment.get('queries')
        # Fragments without ``queries`` used to crash the comprehension below
        # (iterating None raises TypeError); they are skipped instead.
        if queries is None:
            continue
        # Each query-bearing fragment replaces the previous result.
        search_word = [entry['query'] for entry in queries]
    return search_word
def deepseek_process_original_data(file_path):
    """Parse a DeepSeek SSE capture from TOS and persist the extracted fields.

    The capture is split on blank lines. Each chunk that starts with
    ``data: `` is decoded as JSON carrying a patch path ``p`` and a value
    ``v``. A dict-valued ``v`` is read as the initial snapshot
    (``v.response``): it decides whether thinking mode is on and, when search
    is enabled, contributes the first fragment's queries. Later events are
    routed by ``p`` and fragment ``type`` to the thinking text, the answer
    text, the search keywords or the reference list. Bare string values are
    appended to whichever of thinking/answer is currently open.

    If anything raises while parsing or saving, the result of
    ``get_parse_sse_result`` is used instead. When that yields no content
    either, a failure marker is stored and ``robot_utils.feishu_tobot`` is
    called with the file path.

    Args:
        file_path: TOS object key. The fallback path reads the task id from
            segment 1 and the platform from segment 2 of the ``/``-split key.

    Returns:
        tuple: ``(file_path, search_keyword, url_list, think_content,
        response_content, suggestions)``.
    """
    url_list = ""
    think_content = ""
    response_content = ""
    search_keyword = []
    is_think = False      # thinking mode announced by the initial snapshot
    think_bool = False    # string deltas currently belong to the thinking text
    suggestions = []
    response_bool = False  # string deltas currently belong to the answer text
    try:
        original_content = get_string_from_tos(file_path)
        # Events are separated by blank lines; only chunks that start with
        # "data: " are processed ("event: " chunks are ignored).
        for item in original_content.split("\n\n"):
            if not item.startswith("data: "):
                continue
            try:
                # Take the first line after the prefix. Splitting on "data: "
                # truncated every payload whose text contained "data: ", and
                # that event was then silently dropped.
                json_data = json.loads(item[len("data: "):].split("\n", 1)[0])
            except json.JSONDecodeError:
                continue  # skip malformed payloads
            path = json_data.get('p')
            value = json_data.get('v')

            # Initial snapshot.
            if isinstance(value, dict):
                response = value.get('response')
                if response.get('thinking_enabled') == True:
                    is_think = True
                if response.get('search_enabled') == True:
                    if len(response.get('fragments')) > 0:
                        query_list = response.get('fragments')[0].get('queries')
                        if query_list:
                            result = [entry.get('query', '') for entry in query_list]
                            search_keyword.extend(result)

            if is_think:
                # NOTE(review): only the first query of a SEARCH fragment is
                # recorded here, while the snapshot branch above records all
                # of them -- confirm which is intended.
                if path == 'response/fragments' and value[0].get('type') == 'SEARCH':
                    search_keyword.append(value[0].get('queries')[0].get('query'))
                if path == 'response/fragments/0/results' or path == 'response/fragments/-1/results':
                    url_list = value
                if path == 'response/fragments' and value[0].get('type') == 'THINK':
                    think_bool = True
                    think_content += value[0].get('content')
                    continue
                if path == 'response':
                    if isinstance(value[0].get('v'), list):
                        if value[0].get('v')[0].get('type') == 'TOOL_SEARCH':
                            for query in value[0].get('v')[0].get('queries'):
                                search_keyword.append(query.get('query'))
                    # NOTE(review): value[1] is read without a length check; a
                    # single-entry batch raises IndexError and sends the whole
                    # capture to the fallback parser.
                    if value[1].get('p') == 'fragments' or value[1].get('p') == 'response/fragments':
                        if value[1].get('v')[0].get('type') == 'THINK':
                            think_content += value[1].get('v')[0].get('content')
                            think_bool = True
                            continue
                    if value[0].get('p') == 'fragments' or value[0].get('p') == 'response/fragments':
                        if value[0].get('v')[0].get('type') == 'THINK':
                            think_content += value[0].get('v')[0].get('content')
                            think_bool = True
                            continue
                    if value[0].get('v') == 'FINISHED':
                        response_bool = False
                # A RESPONSE fragment closes thinking and opens the answer.
                if path == 'response/fragments' and value[0].get('type') == 'RESPONSE':
                    think_bool = False
                    response_bool = True
                    response_content += value[0].get('content')
                    continue
                if path == 'response/fragments/1/elapsed_secs':
                    think_bool = False
                if path == 'response/fragments/-1/elapsed_secs':
                    think_bool = False
                # Any other appended fragment closes the answer.
                if path == 'response/fragments':
                    response_bool = False
                if response_bool:
                    if isinstance(value, str):
                        response_content += value
                if think_bool:
                    if isinstance(value, str):
                        think_content += value
            else:
                if path == 'response/fragments/-1/content':
                    response_bool = True
                # NOTE(review): only the first query is recorded (see above).
                if path == 'response/fragments' and value[0].get('type') == 'SEARCH':
                    search_keyword.append(value[0].get('queries')[0].get('query'))
                if path == 'response/fragments/0/results' or path == 'response/fragments/-1/results':
                    url_list = value
                if path == 'response/fragments' and value[0].get('type') == 'RESPONSE':
                    response_content += value[0].get('content')
                    response_bool = True
                    continue
                if path == 'response':
                    # NOTE(review): value[1] is read without a length check.
                    if value[1].get('p') == 'fragments':
                        if value[1].get('v')[0].get('type') == 'RESPONSE':
                            response_content += value[1].get('v')[0].get('content')
                            response_bool = True
                            continue
                    if value[0].get('p') == 'fragments':
                        if value[0].get('v')[0].get('type') == 'RESPONSE':
                            response_content += value[0].get('v')[0].get('content')
                            response_bool = True
                            continue
                    if value[0].get('v') == 'FINISHED':
                        response_bool = False
                if path == 'response/fragments' and value[0].get('type') == 'TIP':
                    response_bool = False
                    continue
                if value == 'FINISHED':
                    response_bool = False
                if response_bool:
                    if isinstance(value, str):
                        response_content += value

        spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content,
                                               response_content, suggestions)
        return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
    except Exception:
        # Fallback: use the externally parsed result for this task.
        parts = file_path.split('/')
        platform = parts[2]
        task_id = parts[1]
        context, quote, suggestion, think, search_word = get_parse_sse_result(platform, task_id)
        if context:
            response_content = context
            url_list = quote
            suggestions = suggestion
            think_content = think
            search_keyword = search_word
            spider_save_tos.process_and_save_files_ai(file_path, search_keyword, url_list, think_content,
                                                      response_content, suggestions)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
        else:
            response_content = "对话信息获取失败:-200"
            robot_utils.feishu_tobot(file_path)
            spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content,
                                                   response_content, suggestions)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
#
# Manual run: parse one stored DeepSeek capture.
if __name__ == '__main__':
    file_path = "geo/a374ca2e-eb45-4c2d-badd-9219af98ff38/DP/original.text"
    deepseek_process_original_data(file_path)
import json
import traceback
from aidso_geo.models import spider_save_tos
from aidso_geo.utils import robot_utils
from aidso_geo.utils.ai_interface import get_parse_sse_result
from aidso_geo.utils.tos_utils import get_string_from_tos
def _doubao_mobile_widget_cards(meta_info):
    """Collect product and POI cards from the type-202 entries of ``meta_info``.

    Each qualifying entry holds a JSON string ``info`` whose ``media`` items of
    type 4 wrap an ``applet.render_data`` JSON string, which in turn wraps a
    ``widget_data`` JSON string whose ``data`` field is JSON again.

    Args:
        meta_info: Iterable of meta dicts taken from a text block.

    Returns:
        tuple: ``(media_list, poi_list)`` -- product dicts built from the
        widget's ``data`` entries and POI dicts built from its ``poi_list``.
    """
    media_list = []
    poi_list = []
    for meta in meta_info:
        if meta.get('type') != 202 or not meta.get('info'):
            continue
        info = json.loads(meta.get('info'))
        if not info:
            continue
        for medium in info.get('media'):
            if medium.get('type') != 4:
                continue
            applet = medium.get('applet')
            if not applet or not applet.get('render_data'):
                continue
            render_data = json.loads(applet.get('render_data'))
            if not render_data.get('widget_data'):
                continue
            widget_data = json.loads(render_data.get('widget_data'))
            if not widget_data:
                continue
            try:
                widget_payload = json.loads(widget_data.get('data'))
            except Exception:
                # A widget whose ``data`` cannot be decoded is skipped.
                continue
            if not widget_payload:
                continue
            for pr in widget_payload.get('data') or []:
                if isinstance(pr, dict):
                    media_list.append({
                        'text': pr.get('text', ''),
                        'seller_name': pr.get('seller_name', ''),
                        'image_url': pr.get('image_url', ''),
                        'pid': pr.get('pid', '')
                    })
            for po in widget_payload.get('poi_list') or []:
                poi_list.append({
                    "poi_name": po.get('poi_name'),
                    "image_list": po.get("image_list") or [],
                    "poi_url": f"https://life-share.scsjsd.com/falcon/poi_mwa/poi_detail?poi_id={po.get('poi_id')}",
                    "price": po.get('price', ''),
                    "score": po.get('score', ''),
                    "rank": po.get('rank', '')
                })
    return media_list, poi_list


def doubao_mobile_process_original_data(file_path):
    """Parse a Doubao mobile SSE capture from TOS and persist the extracted fields.

    Every ``data:`` line is decoded as JSON. A top-level ``content`` whose
    first block has ``block_type`` 10040 switches the parser into thinking
    mode for the rest of the capture. In thinking mode the first ``patch_op``
    entry feeds the thinking text, answer text, keywords, references, rich
    media and suggestions. Otherwise the ``patch_op`` entries are looked up by
    ``patch_object`` / ``patch_type``, and events without ``patch_op`` are
    read from ``data[0]`` when its ``cmd`` is ``Append``.

    If anything raises while parsing or saving, the result of
    ``get_parse_sse_result`` is used instead. When that yields no content
    either, a failure marker is stored and ``robot_utils.feishu_tobot`` is
    called with the file path.

    Args:
        file_path: TOS object key. The fallback path reads the task id from
            segment 1 and the platform from segment 2 of the ``/``-split key.

    Returns:
        tuple: ``(file_path, search_keyword, url_list, think_content,
        response_content, suggestions)``.
    """
    url_list = []
    think_content = ""
    response_content = ""
    search_keyword = []
    suggestions = set()   # de-duplicated while parsing, converted to a list before saving
    rich_media_block = []
    is_think = False      # a thinking block was seen; never reset afterwards
    think_bool = False    # text currently belongs to the thinking section
    response_bool = False  # text currently belongs to the answer section
    try:
        original_content = get_string_from_tos(file_path)
        for line in original_content.split("\n"):
            if not line.startswith("data:"):
                continue
            payload = line[len("data:"):].lstrip()
            if not payload:
                continue
            try:
                json_content = json.loads(payload)
            except json.JSONDecodeError:
                continue

            # Full-message events (top-level ``content``).
            content = json_content.get('content')
            if content and content.get('content_block'):
                first_block = content.get('content_block')[0]
                if first_block.get('block_type') == 10040:
                    is_think = True
                    think_bool = True
                    continue
                if content.get("content_type") == 9999 and first_block.get('block_type') == 10050:
                    for creation in first_block.get('content').get('rich_media_block').get('creations'):
                        rich_media_block.append(creation.get('video') or creation.get('ugc_item'))
                if content.get("content_type") == 9999 and first_block.get('block_type') == 10000:
                    response_content += first_block.get("content").get("text_block").get("text")

            text = json_content.get('text')
            patch_op = json_content.get('patch_op')

            if is_think:
                if patch_op and patch_op[0]:
                    patch_value = patch_op[0].get('patch_value')
                    if patch_value:
                        ext = patch_value.get('ext')
                        if ext and ext.get('sp_v2'):
                            for suggestion_item in json.loads(ext.get('sp_v2')):
                                suggestions.add(suggestion_item.get('content'))
                        content_block = patch_value.get('content_block')
                        if content_block:
                            block = content_block[0]
                            block_type = block.get('block_type')
                            if block_type == 10000:
                                text_block = block.get('content').get('text_block').get('text', '')
                                if think_bool:
                                    think_content += text_block
                                if response_bool:
                                    response_content += text_block
                            if block_type == 10025:
                                if think_bool:
                                    search_block = block.get('content').get('search_query_result_block')
                                    if search_block.get('results') is not None:
                                        search_keyword.extend(search_block.get('queries'))
                                        think_content += "\n\n**"
                                        think_content += search_block.get('summary', '')
                                        think_content += "**\n\n"
                                        for result in search_block.get('results'):
                                            url_list.append(result.get('text_card'))
                                        # Search results end the thinking
                                        # section and open the answer.
                                        think_bool = False
                                        response_bool = True
                                        continue
                            if block_type == 10040:
                                think_bool = False
                                response_bool = True
                                continue
                            if block_type == 10050:
                                for creation in block.get('content').get('rich_media_block').get('creations'):
                                    rich_media_block.append(creation.get('video'))
                if think_bool:
                    if isinstance(text, str):
                        think_content += text
                if response_bool:
                    if isinstance(text, str):
                        response_content += text
            else:
                # Plain text deltas are appended only once an answer exists.
                if response_content:
                    if isinstance(text, str):
                        response_content += text
                if patch_op:
                    target_item = next((item for item in patch_op if item.get('patch_object') == 102), None)
                    if target_item:
                        content_str = target_item.get('patch_value').get('content')
                        if content_str:
                            content_json = json.loads(content_str)
                            response_content += content_json.get('text')
                            for reference in content_json.get('search_references') or []:
                                url_list.append(reference.get('text_card'))

                    content_item = next((item for item in patch_op if item.get('patch_type') == 1), None)
                    if content_item:
                        patch_value = content_item.get('patch_value')
                        content_block = patch_value.get('content_block') if patch_value else None
                        if content_block:
                            block = content_block[0]
                            is_text_block = block.get('block_type') == 10000
                            if is_text_block and block.get('is_finish'):
                                # The earlier code set this flag to True and
                                # then, under the same condition, straight
                                # back to False; only the net effect is kept.
                                # NOTE(review): confirm the intended value.
                                response_bool = False
                            if is_text_block and block.get('meta_info'):
                                media_list, poi_list = _doubao_mobile_widget_cards(block.get('meta_info'))
                                if media_list:
                                    response_content += 'render_ecom_card_widget_product_start:'
                                    response_content += json.dumps(media_list, ensure_ascii=False)
                                    response_content += 'render_ecom_card_widget_product_end:'
                                if poi_list:
                                    response_content += 'render_ecom_card_widget_poi_start:'
                                    response_content += json.dumps(poi_list, ensure_ascii=False)
                                    response_content += 'render_ecom_card_widget_poi_end:'
                            if is_text_block and block.get('patch_type') == 1:
                                response_content += block.get('content').get('text_block').get('text', '')
                            if block.get('block_type') == 10025:
                                search_block = block.get('content').get('search_query_result_block')
                                if search_block.get('results') is not None:
                                    search_keyword.extend(search_block.get('queries'))
                                    for result in search_block.get('results'):
                                        if result.get('text_card'):
                                            url_list.append(result.get('text_card'))
                                        if result.get('video_card'):
                                            rich_media_block.append(result.get('video_card'))

                    suggest_item = next((item for item in patch_op if item.get('patch_object') == 50), None)
                    if suggest_item:
                        patch_value = suggest_item.get('patch_value')
                        ext = patch_value.get('ext') if patch_value else None
                        if ext and ext.get('has_suggest') == '1':
                            for suggestion_item in json.loads(ext.get('sp_v2')):
                                suggestions.add(suggestion_item.get('content'))
                else:
                    data_value = json_content.get('data')
                    if isinstance(data_value, list):
                        if data_value[0].get('cmd') == 'Append':
                            gen = data_value[0].get('display').get('display').get('generation_spans')
                            if gen:
                                if gen[0].get('type') == 2:
                                    response_content += gen[0].get('text').get('content')

        suggestions = list(suggestions)
        spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
                                               suggestions, rich_media_block)
        return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
    except Exception:
        # Fallback: use the externally parsed result for this task.
        parts = file_path.split('/')
        platform = parts[2]
        task_id = parts[1]
        context, quote, suggestion, think, search_word = get_parse_sse_result(platform, task_id)
        if context:
            response_content = context
            url_list = quote
            suggestions = suggestion
            think_content = think
            search_keyword = search_word
            spider_save_tos.process_and_save_files_ai(file_path, search_keyword, url_list, think_content,
                                                      response_content, suggestions)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
        else:
            response_content = "对话信息获取失败:-200"
            robot_utils.feishu_tobot(file_path)
            # When parsing failed mid-stream ``suggestions`` is still a set;
            # convert it so this path saves and returns a list like the
            # success path does.
            suggestions = list(suggestions)
            spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
                                                   suggestions, rich_media_block)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
# Manual run: parse the stored Doubao mobile capture of every task id listed in ``aa``.
if __name__ == '__main__':
    aa = ['5856d641-8d06-4cff-a244-bd5d1a5cdae7']
    # file_path = 'geo/c7eb465e-f385-4aa2-89c4-a7cf11897f45/KIMI/1(1).txt'
    for a in aa:
        file_path = f'geo/{a}/DOUBA/original.text'
        doubao_mobile_process_original_data(file_path)
        # print(file_path)
    # file_path = 'geo/da4ee3sssa2-4c8e-ccc4776-aaaaaaa587-40asas7543ssssssaf6b9d//original.text'
    # file_path = 'geo/da4ee3sssa2aaaa-4c8e-ccc4776-aaaaaaa587-40asas7543ssssssaf6b9d/DOUBA/original.text'
import json
from aidso_geo.models import spider_save_tos
from aidso_geo.utils import robot_utils, tos_utils, bh_utils
from aidso_geo.utils.ai_interface import get_parse_sse_result
from aidso_geo.utils.tos_utils import get_string_from_tos
def doubao_process_original_data(file_path):
    """Parse a Doubao SSE capture from TOS and persist the extracted fields.

    Every non-empty ``data:`` line is decoded as JSON. Two event shapes are
    handled: events with ``event_type == 2001`` (the payload is a JSON string
    in ``event_data``) and all other events, which are read through their
    ``patch_op`` list, top-level ``text`` and top-level ``content``.

    If anything raises while parsing or saving, the result of
    ``get_parse_sse_result`` is used instead. When that yields no content
    either, a failure marker is stored and ``robot_utils.feishu_tobot`` is
    called with the file path.

    Args:
        file_path: TOS object key. The fallback path reads the task id from
            segment 1 and the platform from segment 2 of the ``/``-split key.

    Returns:
        tuple: ``(file_path, search_keyword, url_list, think_content,
        response_content, suggestions)``.
    """
    # NOTE(review): the nesting below was reconstructed from a listing that
    # had lost its indentation -- verify it against the repository copy.
    url_list = ""
    think_content = ""
    response_content = ""
    search_keyword = []
    suggestions = []
    is_think = False  # thinking flag used by the patch_op event shape
    rich_media_block = []
    think_bool = False  # thinking flag used by the event_type 2001 shape
    response_bool = False  # never read in this function
    try:
        original_content = get_string_from_tos(file_path)
        content_list = original_content.split("\n")
        for i in content_list:
            if i != "":
                if not i.startswith("data:"):
                    continue
                payload = i[len("data:"):].lstrip()
                if not payload:
                    continue
                try:
                    json_content = json.loads(payload)
                except (IndexError, json.JSONDecodeError):
                    continue
                if json_content.get('event_type') == 2001:
                    # event_data is itself a JSON string.
                    even_data = json.loads(json_content.get('event_data'))
                    message_data = even_data.get('message')
                    # tts_content replaces (does not extend) the answer so far.
                    if even_data.get('tts_content') is not None:
                        response_content = even_data.get('tts_content')
                    # content_type 2007: collect the video cards as rich media.
                    if message_data.get('content_type') == 2007 :
                        for i in json.loads(message_data.get('content')).get("search_result").get("video_card").get("card_list"):
                            rich_media_block.append(i)
                    # content_type 10040 brackets the thinking section:
                    # is_finish absent opens it, is_finish True closes it.
                    if message_data.get('content_type') == 10040 and message_data.get('is_finish') is None:
                        think_bool = True
                        continue
                    if message_data.get('content_type') == 10040 and message_data.get('is_finish') == True:
                        think_bool = False
                        continue
                    if think_bool:
                        if json.loads(message_data.get('content')).get('text') is not None:
                            think_content += json.loads(message_data.get('content')).get('text')
                        content_json = json.loads(message_data.get('content'))
                        # Insert a bold "searched N keywords, referenced M
                        # articles" line into the thinking text.
                        if message_data.get('content_type') == 10025 and content_json.get('results') is not None:
                            think_content += "\n\n"
                            think_content += "**搜索"
                            think_content += str(len(content_json.get('queries')))
                            think_content += "个关键词,参考"
                            think_content += str(len(content_json.get('results')))
                            think_content += "篇文章**"
                            think_content += "\n\n"
                    # content_type 10025: search queries and results.
                    if message_data.get('content_type') == 10025:
                        content_json = json.loads(message_data.get('content'))
                        if content_json.get('queries') is not None and content_json.get('results') is not None:
                            search_keyword = search_keyword + json.loads(message_data.get('content')).get('queries')
                            if content_json.get('scene') == 2:
                                url_list = content_json.get('results')
                    # content_type 2002: follow-up suggestions.
                    if message_data.get('content_type') == 2002:
                        suggestions = suggestions + json.loads(message_data.get('content')).get('suggestions')
                else:
                    if json_content.get("patch_op"):
                        # Only the first patch_op entry is inspected.
                        if json_content.get('patch_op')[0].get("patch_object") == 111:
                            if json_content.get('patch_op')[0].get("patch_value").get("tts_content"):
                                response_content += json_content.get('patch_op')[0].get("patch_value").get("tts_content")
                        if json_content.get('patch_op')[0].get("patch_object") == 1:
                            if json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
                                    "content").get("search_query_result_block"):
                                # The search block replaces the keywords and
                                # references collected so far.
                                search_keyword = json_content.get('patch_op')[0].get("patch_value").get("content_block")[
                                    0].get("content").get("search_query_result_block").get("queries")
                                url_list = json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
                                    "content").get("search_query_result_block").get("results")
                                if is_think == True and json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
                                        "content").get("search_query_result_block").get("results") is not None:
                                    think_content += "\n\n"
                                    think_content += "**"
                                    think_content+=json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
                                        "content").get("search_query_result_block").get("summary")
                                    think_content += "**"
                                    think_content += "\n\n"
                            # A text block (10000) with a parent_id in a
                            # multi-entry patch switches thinking on.
                            if json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
                                    "block_type") ==10000 and json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
                                    "parent_id") and len(json_content.get('patch_op'))>1:
                                is_think = True
                                if json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get("content").get('text_block').get("text"):
                                    think_content+=json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get("content").get("text_block").get("text")
                                continue
                            # A 10040 block switches thinking off.
                            if json_content.get('patch_op')[0].get("patch_value").get("content_block")[0].get(
                                    "block_type") == 10040:
                                is_think = False
                                continue
                        # patch_object 50: suggestions stored as a JSON string
                        # in ext.sp_v2.
                        if json_content.get('patch_op')[0].get("patch_object") == 50:
                            for sug in json.loads(
                                    json_content.get('patch_op')[0].get("patch_value").get("ext").get("sp_v2")):
                                suggestions.append(sug.get("content"))
                    if is_think:
                        # While thinking, top-level text goes to the thinking text.
                        if json_content.get("text"):
                            think_content += json_content.get("text")
                    else:
                        if json_content.get("content"):
                            content_block = json_content.get("content").get('content_block')
                            if content_block:
                                if len(content_block)>0:
                                    if content_block[0].get('block_type') ==10050:
                                        for i in content_block[0].get('content').get('rich_media_block').get('creations'):
                                            rich_media_block.append(i.get('video'))
                                    if content_block[0].get('block_type') ==10000:
                                        # The text block replaces (does not
                                        # extend) the answer so far.
                                        if content_block[0].get("content").get("text_block").get("text"):
                                            response_content = content_block[0].get("content").get("text_block").get("text")
        # De-duplicate; set() does not preserve the original order.
        suggestions = list(set(suggestions))
        spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
                                               suggestions,rich_media_block)
        return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
    except Exception as e:
        # Fallback: use the externally parsed result for this task.
        parts = file_path.split('/')
        platform = parts[2]
        task_id = parts[1]
        context, quote, suggestion, think, search_word = get_parse_sse_result(platform, task_id)
        if context:
            response_content = context
            url_list = quote
            suggestions = suggestion
            think_content = think
            search_keyword = search_word
            spider_save_tos.process_and_save_files_ai(file_path, search_keyword, url_list, think_content,
                                                      response_content,suggestions)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
        else:
            # No fallback content either: store a failure marker and alert.
            response_content = "对话信息获取失败:-200"
            robot_utils.feishu_tobot(file_path)
            spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
                                                   suggestions,rich_media_block)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
# Manual run: parse one stored Doubao capture.
if __name__ == '__main__':
    file_path = 'geo/21430906-2656-4ba4-b884-5f937745734c/DB/original.text'
    doubao_process_original_data(file_path)
import json
from aidso_geo.models import spider_save_tos
from aidso_geo.utils import robot_utils
from aidso_geo.utils.ai_interface import get_parse_sse_result
from aidso_geo.utils.tos_utils import get_string_from_tos
def douyin_ai_process_original_data(file_path):
    """Parse a stored capture and extract reply text, reasoning text, references, videos and suggestions.

    The text loaded through ``get_string_from_tos(file_path)`` is split on
    newlines and every line starting with ``data:`` is decoded as JSON. The
    extracted pieces are persisted via ``spider_save_tos.process_and_save_files``.

    If anything in the parsing path raises, the function falls back to
    ``get_parse_sse_result(platform, task_id)``, where ``task_id`` and
    ``platform`` are the second and third ``/``-separated segments of
    ``file_path``. When the fallback yields no content, a fixed failure message
    is stored and ``robot_utils.feishu_tobot(file_path)`` is called.

    Returns:
        Tuple ``(file_path, search_keyword, url_list, think_content,
        response_content, suggestions)``.
    """
    url_list = []
    think_content = ""
    response_content = ""
    search_keyword = []
    # is_think / think_bool / response_bool are assigned here but never read in this function.
    is_think = False
    think_bool = False
    rich_media_block = []
    suggestions = []
    response_bool = False
    try:
        original_content = get_string_from_tos(file_path)
        # Split the content on newlines and drop empty strings.
        content_list = [item for item in original_content.split("\n") if item]
        for item in content_list:
            if item.startswith("data:"):
                try:
                    # Extract and parse the JSON payload that follows the "data:" prefix.
                    data_str = item.split("data:")[1]
                    json_data = json.loads(data_str)
                except (IndexError, json.JSONDecodeError):
                    # Lines that are not valid JSON are skipped.
                    continue
                if json_data.get('data'):
                    if json_data.get('data')[0]:
                        for cur in json_data.get('data'):
                            if not cur:
                                continue
                            display_obj = cur.get('display')
                            if display_obj and display_obj.get('display'):
                                display = display_obj.get('display')
                                if display.get('generation_spans'):
                                    # Only the first span of each entry is inspected.
                                    # Span type 2: its text content is appended to the reply.
                                    if display.get('generation_spans')[0].get('type') ==2:
                                        response_content+=display.get('generation_spans')[0].get('text').get('content')
                                    # Span type 11: its "cot" text is appended to the reasoning text.
                                    if display.get('generation_spans')[0].get('type') ==11:
                                        think_content+=display.get('generation_spans')[0].get('cot').get('text')
                                        if display.get('generation_spans')[0].get('cot').get('search_queries'):
                                            # Search queries are appended one per line after the reasoning text.
                                            think_content += '\n'+'\n'.join(display.get('generation_spans')[0].get('cot').get('search_queries'))+'\n\n'
                                    # Span type 0: scan every data entry for business_data of type 999
                                    # whose raw_data JSON carries display.dis_video_info.
                                    if display.get('generation_spans')[0].get('type') == 0:
                                        for i in json_data.get("data"):
                                            if i.get("container_info"):
                                                if i.get("container_info").get("business_data"):
                                                    if i.get("container_info").get("business_data")[0].get("type") == 999:
                                                        if json.loads(i.get("container_info").get("business_data")[0].get("data").get("raw_data")).get("display"):
                                                            if json.loads(i.get("container_info").get("business_data")[0].get("data").get("raw_data")).get("display").get("dis_video_info"):
                                                                for video in json.loads(i.get("container_info").get("business_data")[0].get("data").get("raw_data")).get("display").get("dis_video_info"):
                                                                    rich_media_block.append(video)
                                if display.get('generation_refs'):
                                    # References are collected only once: skip when url_list is already populated.
                                    if len(url_list) >0:
                                        continue
                                    for i in display.get('generation_refs'):
                                        if i.get('bottom_source_info'):
                                            url_list.append(i)
                        container_info = json_data.get('data')[0].get('container_info')
                        if container_info:
                            business_data = container_info.get('business_data')
                            if business_data:
                                # bytesync_data is consulted only while no reply text has been collected yet.
                                if business_data[0].get('data').get('bytesync_data') and response_content == "":
                                    if len(business_data[0].get('data').get('bytesync_data'))>0:
                                        if json.loads(business_data[0].get('data').get('bytesync_data')[0]).get('display').get('generation_spans'):
                                            for cn in json.loads(business_data[0].get('data').get('bytesync_data')[0]).get('display').get('generation_spans'):
                                                if cn.get('type') ==2:
                                                    response_content+=cn.get('text').get('content')
                                if business_data[0].get('data').get('raw_data'):
                                    # raw_data is a JSON string; its display section may carry videos and a query list.
                                    video_list= json.loads(business_data[0].get('data').get('raw_data')).get('display').get('dis_video_info')
                                    if video_list:
                                        rich_media_block.extend(video_list)
                                    suggestions_list = json.loads(business_data[0].get('data').get('raw_data')).get('display').get('query_list')
                                    if suggestions_list:
                                        suggestions = [item['content'] for item in suggestions_list]
        spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
                                               suggestions,rich_media_block)
        return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
    except Exception as e:
        # Fallback: derive platform / task id from the path and ask get_parse_sse_result for the pieces.
        parts = file_path.split('/')
        platform = parts[2]
        task_id = parts[1]
        context, quote, suggestion, think, search_word = get_parse_sse_result(platform, task_id)
        if context:
            response_content = context
            url_list = quote
            suggestions = suggestion
            think_content = think
            search_keyword = search_word
            spider_save_tos.process_and_save_files_ai(file_path, search_keyword, url_list, think_content,
                                                      response_content,suggestions)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
        else:
            # No content from the fallback either: store a fixed failure message and call feishu_tobot.
            response_content = "对话信息获取失败:-200"
            robot_utils.feishu_tobot(file_path)
            spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
                                                   suggestions,rich_media_block)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
# Script entry point: when this module is run directly, process one hard-coded path.
if __name__ == '__main__':
    # Sample object path handed to douyin_ai_process_original_data.
    file_path = "geo/8004e38c-37da-4e82-b08f-453ccdf0d661/DYAI/original.text"
    douyin_ai_process_original_data(file_path)
import json
from aidso_geo.utils import robot_utils, bh_utils
from aidso_geo.utils.ai_interface import get_parse_sse_result
from aidso_geo.utils.bh_utils import query_data
from aidso_geo.utils.tos_utils import get_string_from_tos
from aidso_geo.models import spider_save_tos
def kimi_process_original_data(file_path):
    """Parse a stored JSON capture and extract search keywords, references, reasoning text and reply text.

    The text loaded through ``get_string_from_tos(file_path)`` is decoded as
    JSON; when it is a list, each element is dispatched on its ``mask`` value.
    The extracted pieces are persisted via
    ``spider_save_tos.process_and_save_files``.

    If anything in the parsing path raises, the function falls back to
    ``get_parse_sse_result(platform, task_id)``, where ``task_id`` and
    ``platform`` are the second and third ``/``-separated segments of
    ``file_path``. When the fallback yields no content, a fixed failure message
    is stored and ``robot_utils.feishu_tobot(file_path)`` is called.

    Returns:
        Tuple ``(file_path, search_keyword, url_list, think_content,
        response_content, suggestions)``.
    """
    url_list = []
    think_content = ""
    response_content = ""
    search_keyword = []
    suggestions = []
    # is_think / think_bool / response_bool are assigned here but never read in this function.
    is_think = False
    think_bool = False
    response_bool = False
    try:
        original_content = get_string_from_tos(file_path)
        original_content_list = json.loads(original_content)
        if isinstance(original_content_list, list):
            for item in original_content_list:
                block_data = item.get('block', {})
                think_data = block_data.get('think', {})
                text_data = block_data.get('text', {})
                # Search events: only the first keyword of each event is recorded.
                if item.get('mask') == 'block.search' or item.get('mask') == 'block.search.keywords':
                    search_keyword.append(block_data.get('search').get('keywords')[0])
                # if item.get('mask') == 'block.search.webPages':
                #     url_list.extend(block_data.get('search').get('webPages'))
                if item.get('mask') =='message.refs.usedSearchChunks':
                    searchChunks = item.get('message').get("refs").get('usedSearchChunks')
                    if searchChunks:
                        for i in searchChunks:
                            # Flatten each chunk: copy its "base" dict and add id / refIndex.
                            new_item = i['base'].copy()
                            new_item['id'] = i['id']
                            new_item['refIndex'] = i.get('refIndex')
                            url_list.append(new_item)
                # searchChunks are used only while no reference has been collected yet.
                if item.get('mask') =='message.refs.searchChunks' and len(url_list) ==0:
                    searchChunks = item.get('message').get("refs").get('searchChunks')
                    if searchChunks:
                        for i in searchChunks:
                            new_item = i['base'].copy()
                            new_item['id'] = i['id']
                            new_item['refIndex'] = i.get('refIndex')
                            url_list.append(new_item)
                if item.get('mask') == 'block.think.content' or item.get('mask') == 'block.think':
                    if think_data.get('content'):
                        think_content += think_data.get('content')
                if item.get('mask') == 'block.text.content' or item.get('mask') == 'block.text':
                    if text_data.get('content'):
                        response_content += text_data.get('content')
                # An error block is used as the reply only when no reply text exists yet.
                if item.get('mask') == 'block.error' and response_content == "":
                    response_content+=item.get('block').get('error').get("localizedMessage").get("message")
        spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
                                               suggestions)
        return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
    except Exception as e:
        # Fallback: derive platform / task id from the path and ask get_parse_sse_result for the pieces.
        parts = file_path.split('/')
        platform = parts[2]
        task_id = parts[1]
        context, quote, suggestion, think, search_word = get_parse_sse_result(platform, task_id)
        if context:
            response_content = context
            url_list = quote
            suggestions = suggestion
            think_content = think
            search_keyword = search_word
            spider_save_tos.process_and_save_files_ai(file_path, search_keyword, url_list, think_content,
                                                      response_content, suggestions)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
        else:
            # No content from the fallback either: store a fixed failure message and call feishu_tobot.
            response_content = "对话信息获取失败:-200"
            robot_utils.feishu_tobot(file_path)
            spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
                                                   suggestions)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
# Script entry point: when this module is run directly, re-process every row returned by the query below.
if __name__ == '__main__':
    result = bh_utils.query_data("select * from geo_commit_task where platform = 'KIMI' and insertime >1778428800 order by insertime asc ")
    # # result = []
    for i in result:
        # The object path is built from each row's taskId.
        file_path = f"geo/{i.get('taskId')}/KIMI/original.text"
        kimi_process_original_data(file_path)
    # task = '2b2f460d-3ed6-4e0b-940c-9d1c47e27474'
    # file_path = f"geo/{task}/KIMI/original.text"
    # kimi_process_original_data(file_path)
    # aa = 'https://douchacha-web.tos-cn-beijing.volces.com/geo/2ac65e0ef17642eaa42d440e572f693b/KIMI/original.text'
import json
import aidso_geo.utils.bh_utils as bh_utils
import time
import os, sys
import ast
import requests
from copy import deepcopy
from datetime import datetime
from enum import Enum
from aidso_geo.core.down_load_bot import get_req_id
from aidso_geo.utils.tos_utils import put_string_to_tos, check_file_in_tos, get_tos_file_size
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(BASE_DIR)
# BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# sys.path.append(BASE_DIR)
from aidso_geo.config.base_config import init_redis
from aidso_geo.utils.ai_utils import ai_get_brand_sentiment_and_mentions, \
ai_get_brand_sentiment_and_mentions_by_articles, ai_get_product_list, ai_get_product_sentiment_and_mentions, \
ai_get_product_list_search
from aidso_geo.models import doubao_data_process, deepseek_data_process, yuanbao_data_process, qianwen_data_process, \
kimi_data_process, wenxin_data_process, baiduai_data_process, doubao_android_data_process, \
deepseek_android_data_process, douyinai_data_process, qianwen_android_data_process, yuanbao_android_data_process, \
xiaohongshu_android_data_process
from aidso_geo.utils import ai_utils, spider_interface, ai_interface, url_utils, robot_utils
from aidso_geo.utils import tos_utils
from loguru import logger
# Module-level client created once at import time by init_redis(); used by _cache_set_json below.
redis_client = init_redis()
def _cache_set_json(key: str, obj: dict, ttl: int):
    """Store ``obj`` under ``key`` as a JSON string.

    Args:
        key: Cache key.
        obj: Value to serialize; non-ASCII characters are written as-is
            (``ensure_ascii=False``).
        ttl: Forwarded to ``redis_client.set`` as its ``ex`` argument.
    """
    payload = json.dumps(obj, ensure_ascii=False)
    redis_client.set(key, payload, ex=ttl)
def commit_task(data, process):
    """Stamp ``data`` with the current time and a status, then insert it into ``geo_commit_task``.

    Args:
        data: Row dict. It is mutated in place: ``insertime`` is set to the
            current Unix time (seconds) and ``status`` to ``process``.
        process: Value stored under ``status``.

    Returns:
        True when the insert call returned without raising, False otherwise.
    """
    try:
        data["insertime"] = int(time.time())
        data["status"] = process
        bh_utils.insert_data('geo_commit_task', [data])
        # bh_utils.insert_data('geo_commit_task_log', [data])
        return True
    except Exception:
        # Still a best-effort write that reports failure through the return
        # value, but the error is logged instead of vanishing silently.
        logger.exception("commit_task: failed to insert into geo_commit_task")
        return False
def get_keyword_ranks(text, keywords):
    """Return, for every keyword, the 1-based positions at which it appears in ``text``.

    The text is scanned left to right. At each step the keyword that starts
    earliest wins; on a tie the shorter keyword wins, and among equally long
    keywords the one listed first wins. The scan then resumes right after the
    matched keyword, so matches never overlap.

    Args:
        text: Text to scan. ``None`` is treated as an empty string.
        keywords: A single keyword or an iterable of keywords. ``None`` is
            treated as no keywords.

    Returns:
        One ``{"word": keyword, "rank_list": [...]}`` dict per entry of
        ``keywords`` (in input order). ``rank_list`` holds the ordinal numbers
        of that keyword among all matches and is empty when it never matched.
    """
    if keywords is None:
        keywords = []
    elif isinstance(keywords, str):
        keywords = [keywords]
    text = text or ""

    # An empty keyword matches at every offset with length 0, so the scan would
    # never advance. Such keywords (and non-strings) are excluded from the
    # search but still reported with an empty rank_list.
    searchable = [key for key in keywords if isinstance(key, str) and key]

    appearance_order = []
    start_pos = 0
    text_length = len(text)
    while start_pos < text_length:
        best = None  # (position, keyword length, keyword)
        for key in searchable:
            pos = text.find(key, start_pos)
            if pos == -1:
                continue
            # Earlier position first, then shorter keyword; strict "<" keeps
            # the first-listed keyword on a full tie.
            if best is None or (pos, len(key)) < (best[0], best[1]):
                best = (pos, len(key), key)
        if best is None:
            break
        appearance_order.append(best[2])
        start_pos = best[0] + best[1]

    result = []
    for key in keywords:
        rank_list = [i + 1 for i, k in enumerate(appearance_order) if k == key]
        result.append({"word": key, "rank_list": rank_list})
    return result
def calculate_rank_min_and_count_sum(data_list):
    """Return ``(min_rank, count_sum)`` for a list of ``{"rank": ..., "count": ...}`` dicts.

    ``min_rank`` is the smallest rank greater than 0, or 0 when no entry has a
    positive rank. ``count_sum`` is the sum of all ``count`` values.
    """
    ranked = []
    for entry in data_list:
        rank = entry['rank']
        if rank > 0:
            ranked.append(rank)
    count_sum = sum([entry['count'] for entry in data_list])
    if ranked:
        min_rank = min(ranked)
    else:
        min_rank = 0
    return (min_rank, count_sum)
def process_com_map(data):
    """Normalize ``data["comWordsMap"]`` in place to a list of dicts.

    When the key is absent or ``None`` the dict is returned untouched.
    Otherwise the value is normalized as follows:

    * a string is decoded as JSON, falling back to a Python literal
      (``ast.literal_eval``); a string that is neither becomes ``[]``;
    * anything that is not a list becomes ``[]``;
    * list elements that are not dicts are dropped.

    Args:
        data: Dict that may carry a ``comWordsMap`` entry; mutated in place.

    Returns:
        The same ``data`` object.
    """
    com_words_map = data.get("comWordsMap")
    if com_words_map is None:
        return data

    # A string value is parsed into a list.
    if isinstance(com_words_map, str):
        try:
            com_words_map = json.loads(com_words_map)
        except Exception:
            try:
                com_words_map = ast.literal_eval(com_words_map)
            except Exception:
                # Neither JSON nor a Python literal: normalize to an empty list
                # instead of letting the parse error escape.
                com_words_map = []

    # Fallback: guarantee a list.
    if not isinstance(com_words_map, list):
        com_words_map = []

    # Second fallback: drop dirty entries such as stray strings.
    com_words_map = [x for x in com_words_map if isinstance(x, dict)]

    # Write back so all later logic sees the normalized structure.
    data["comWordsMap"] = com_words_map
    return data
def keyword_map_brand(keyword_list):
    """Resolve a brand for each keyword from ``geo_keyword_brand_map``, asking the AI helper for the rest.

    Keywords already present in the table are read from it. The remaining ones
    are passed to ``ai_utils.ai_get_brand_name``; a non-empty answer is inserted
    into the table.

    Returns:
        List of the AI-produced rows followed by the rows read from the table.
    """
    def sql_quote(s: str) -> str:
        # Most basic escaping: backslashes first, then single quotes.
        escaped = s.replace("\\", "\\\\").replace("'", "\\'")
        return "'" + escaped + "'"

    in_list = ",".join([sql_quote(keyword) for keyword in keyword_list])
    if in_list:
        query_result = bh_utils.query_data(
            f"select brand,keyword from geo_keyword_brand_map where keyword in ({in_list}) group by keyword") or []
    else:
        query_result = []

    found_keywords = set()
    for row in (query_result or []):
        if row.get("keyword"):
            found_keywords.add(row.get("keyword"))

    brand_map = []
    missing_keywords = [kw for kw in keyword_list if kw not in found_keywords]
    if missing_keywords:
        ai_brand = ai_utils.ai_get_brand_name(missing_keywords) or []
        if ai_brand:
            bh_utils.insert_data("geo_keyword_brand_map", ai_brand)
            brand_map.extend(ai_brand)
    brand_map.extend(query_result)
    return brand_map
def extract_keywords(data):
    """Flatten a mixed list of strings and ``{"keywords": ...}`` dicts into stripped keyword strings.

    String items are taken as-is. For dict items the ``keywords`` value may be
    a list (only its string members are used) or a single string. Blank values
    and items of any other type are ignored.

    Returns:
        List of non-empty, stripped keywords in input order; ``[]`` for falsy input.
    """
    if not data:
        return []

    collected = []
    for entry in data:
        if isinstance(entry, str):
            candidates = [entry]
        elif isinstance(entry, dict):
            keywords = entry.get("keywords", [])
            if isinstance(keywords, list):
                candidates = [kw for kw in keywords if isinstance(kw, str)]
            elif isinstance(keywords, str):
                candidates = [keywords]
            else:
                candidates = []
        else:
            continue
        collected.extend(text.strip() for text in candidates if text.strip())
    return collected
def convert_rank_data(data):
    """Collapse each ``{"word", "rank_list"}`` entry into ``{"word", "rank", "count"}``.

    ``rank`` is the smallest value of ``rank_list`` (0 when the list is missing
    or empty) and ``count`` is the length of ``rank_list``.
    """
    def _summarize(entry):
        ranks = entry.get("rank_list", []) or []
        best_rank = min(ranks) if ranks else 0
        return {"word": entry.get("word"), "rank": best_rank, "count": len(ranks)}

    return [_summarize(entry) for entry in data]
def cache_get_ai_brand_list(taskId, platform, response_content, prompt):
    """Return the AI brand list for a task/platform, using ``geo_ai_rank_list`` as a cache.

    A row for ``taskId`` + ``platform`` with a non-empty ``ai_brand_list`` is
    returned directly (a string value is JSON-decoded when possible, otherwise
    returned as the raw string). On a miss the list is produced by
    ``ai_utils.ai_get_brand_list(response_content, prompt)`` and a new row is
    inserted.

    Args:
        taskId: Task identifier used in the cache lookup.
        platform: Platform identifier used in the cache lookup.
        response_content: Text handed to the AI helper on a cache miss.
        prompt: Prompt handed to the AI helper on a cache miss.
    """
    def _escape_sql(s):
        # Single quotes are doubled, matching the escaping used by the other
        # cache helpers in this file that build their SQL by hand.
        return str(s).replace("'", "''")

    result = bh_utils.query_data(
        f"select * from geo_ai_rank_list where taskId = '{_escape_sql(taskId)}' "
        f"and platform = '{_escape_sql(platform)}' limit 1"
    )
    if result:
        db_row = result[0]
        ai_brand_list = db_row.get("ai_brand_list")
        if ai_brand_list:
            if isinstance(ai_brand_list, str):
                try:
                    return json.loads(ai_brand_list)
                except Exception:
                    # Not JSON: return the stored text unchanged.
                    return ai_brand_list
            return ai_brand_list

    # Cache miss (no row, or an empty value): compute and store.
    ai_result = ai_utils.ai_get_brand_list(response_content, prompt)
    bh_utils.insert_data("geo_ai_rank_list", [{"taskId": taskId, "platform": platform, "ai_brand_list": ai_result}])
    return ai_result
def cache_get_ai_product_list(taskId, platform, response_content, prompt):
    """Return the AI product list for a task/platform, using ``geo_ai_product_list`` as a cache.

    A row for ``taskId`` + ``platform`` with a non-empty ``ai_brand_list`` is
    returned directly (a string value is JSON-decoded when possible, otherwise
    returned as the raw string). On a miss the list is produced by
    ``ai_utils.ai_get_product_list`` followed by ``ai_get_product_list_search``
    and a new row is inserted.

    Args:
        taskId: Task identifier used in the cache lookup.
        platform: Platform identifier used in the cache lookup.
        response_content: Text handed to the AI helper on a cache miss.
        prompt: Prompt handed to the AI helper on a cache miss.
    """
    def _escape_sql(s):
        # Single quotes are doubled, matching the escaping used by the other
        # cache helpers in this file that build their SQL by hand.
        return str(s).replace("'", "''")

    result = bh_utils.query_data(
        f"select * from geo_ai_product_list where taskId = '{_escape_sql(taskId)}' "
        f"and platform = '{_escape_sql(platform)}' limit 1"
    )
    if result:
        db_row = result[0]
        # The product table stores its payload in a column named "ai_brand_list".
        ai_brand_list = db_row.get("ai_brand_list")
        if ai_brand_list:
            if isinstance(ai_brand_list, str):
                try:
                    return json.loads(ai_brand_list)
                except Exception:
                    # Not JSON: return the stored text unchanged.
                    return ai_brand_list
            return ai_brand_list

    # Cache miss (no row, or an empty value): compute and store.
    ai_result_search = ai_utils.ai_get_product_list(response_content, prompt)
    ai_result = ai_get_product_list_search(ai_result_search)
    bh_utils.insert_data("geo_ai_product_list", [{"taskId": taskId, "platform": platform, "ai_brand_list": ai_result}])
    return ai_result
def split_word_rank(ai_word_list, brand_words, com_word_list, all_word_rank_list):
    """Attach rank/count/brand info from ``all_word_rank_list`` to three separate word lists.

    Args:
        ai_word_list: Words to annotate (may be empty or ``None``).
        brand_words: Words to annotate (may be empty or ``None``).
        com_word_list: Words to annotate (may be empty or ``None``).
        all_word_rank_list: Dicts with ``word`` and optional ``rank``, ``count``,
            ``brand``; entries without a word are ignored, later duplicates win.

    Returns:
        Three lists (same order as the first three arguments) of
        ``{"word", "rank", "count", "brand"}`` dicts. Unknown words get rank 0,
        count 0 and the placeholder brand.
    """
    no_brand = '没有品牌'

    lookup = {
        entry.get("word"): {
            "rank": entry.get("rank", 0),
            "count": entry.get("count", 0),
            "brand": entry.get("brand", no_brand),
        }
        for entry in all_word_rank_list
        if entry.get("word")
    }

    def build_result(word_list):
        if not word_list:
            return []
        annotated = []
        for word in word_list:
            info = lookup.get(word, {})
            annotated.append({
                "word": word,
                "rank": info.get("rank", 0),
                "count": info.get("count", 0),
                "brand": info.get("brand", no_brand),
            })
        return annotated

    return build_result(ai_word_list), build_result(brand_words), build_result(com_word_list)
def keyword_with_brand(all_keyword_with_rank_sentiment, all_keyword_with_brand):
    """Set ``brand`` on every ranked-keyword dict from a keyword-to-brand list.

    Args:
        all_keyword_with_rank_sentiment: Dicts with a ``word`` key; mutated in place.
        all_keyword_with_brand: Dicts with ``keyword`` and optional ``brand``;
            entries without a keyword are ignored.

    Returns:
        The same ``all_keyword_with_rank_sentiment`` list; words without a
        mapping get the placeholder brand.
    """
    no_brand = "没有品牌"

    brand_by_keyword = {}
    for mapping in all_keyword_with_brand:
        if mapping.get("keyword"):
            brand_by_keyword[mapping.get("keyword")] = mapping.get("brand", no_brand)

    for entry in all_keyword_with_rank_sentiment:
        entry["brand"] = brand_by_keyword.get(entry.get("word"), no_brand)
    return all_keyword_with_rank_sentiment
def build_brand_summary(com_word_list_rank, com_brand):
    """Group per-keyword rank details under each brand and aggregate them.

    Args:
        com_word_list_rank: Per-word detail dicts (``word``, ``rank``, ``count``, ...).
        com_brand: Dicts with ``brand`` and ``keywords`` (list of words).

    Returns:
        One dict per brand: ``word`` (the brand name), ``count`` (sum of its
        keywords' counts), ``rank`` (smallest non-zero keyword rank, else 0)
        and ``vos`` (copies of the keyword details; keywords without a detail
        get a zero-rank, zero-count placeholder).
    """
    # word -> detail lookup
    details = {}
    for detail in com_word_list_rank:
        if detail.get("word") is not None:
            details[detail.get("word")] = detail

    summary = []
    for brand_entry in com_brand:
        vos = []
        for keyword in brand_entry.get("keywords") or []:
            known = details.get(keyword)
            # Keywords missing from com_word_list_rank get a default structure.
            vos.append(known.copy() if known else {"word": keyword, "rank": 0, "count": 0})

        ranked = [vo.get("rank", 0) for vo in vos if vo.get("rank", 0) != 0]
        summary.append({
            "word": brand_entry.get("brand", ""),
            "count": sum(vo.get("count", 0) or 0 for vo in vos),
            "rank": min(ranked) if ranked else 0,
            "vos": vos,
        })
    return summary
def pick_brand_from_vos(vos):
    """Choose one brand name to represent a list of keyword detail dicts.

    Rules, in order:
    1. If any entry has a non-zero rank, use the brand of the entry with the
       largest rank (first one on ties); an empty brand becomes the placeholder.
    2. Otherwise use the first brand that is not the placeholder.
    3. Otherwise return the placeholder.
    """
    no_brand = "没有品牌"
    if not vos:
        return no_brand

    # 1. Entry with the largest non-zero rank.
    top = None
    for entry in vos:
        if entry.get("rank", 0) == 0:
            continue
        if top is None or entry.get("rank", 0) > top.get("rank", 0):
            top = entry
    if top is not None:
        return top.get("brand", no_brand) or no_brand

    # 2. All ranks are 0: first real brand wins.
    real_brands = (entry.get("brand") for entry in vos)
    # 3. Nothing but placeholders.
    return next((brand for brand in real_brands if brand and brand != no_brand), no_brand)
def fill_outer_brand(data):
    """Return shallow copies of ``data``'s items with ``brand`` set from their ``vos``.

    The brand of each item is whatever ``pick_brand_from_vos`` returns for the
    item's ``vos`` list (an absent ``vos`` is treated as empty). The input
    items are not modified.
    """
    def _with_brand(entry):
        enriched = entry.copy()
        enriched["brand"] = pick_brand_from_vos(enriched.get("vos", []))
        return enriched

    return [_with_brand(entry) for entry in data]
def filter_ai_word_list(ai_word_list_with_rank, brand_words_rank, com_word_list_rank):
    """Drop AI words whose brand already appears among the brand words or competitor words.

    Brands are compared after trimming and lower-casing. Items whose brand is
    exactly the placeholder are always kept.

    Returns:
        The kept items of ``ai_word_list_with_rank`` in their original order.
    """
    def normalize_brand(brand):
        return str(brand).strip().lower() if brand else ""

    # Brands present in either reference list.
    excluded = set()
    for reference in (brand_words_rank, com_word_list_rank):
        for entry in reference:
            if entry.get("brand"):
                excluded.add(normalize_brand(entry.get("brand")))

    return [
        entry
        for entry in ai_word_list_with_rank
        if entry.get("brand") == "没有品牌"
        or normalize_brand(entry.get("brand")) not in excluded
    ]
def merge_brand_rank_by_subset(brand_words_rank, ai_word_list_with_rank):
    """Fold matching AI words into the brand words and return what is left of the AI list.

    Rules:
    1. Pass one: a brand word and an AI word that are equal (ignoring case and
       surrounding whitespace) match; the brand item's rank / count are
       overwritten with the AI item's, and the brand item stops looking.
    2. Pass two: every brand item, including those matched in pass one, is
       compared with the remaining AI words using a two-way substring test
       (ignoring case). On a match the rank becomes the smaller non-zero rank
       and the count becomes the larger count. A brand item keeps absorbing
       further AI items.
    3. Each AI item can be consumed at most once.

    Both inputs are deep-copied; the originals are not modified.

    Returns:
        ``(brand_list, remain_ai_list)``: the updated brand items and the AI
        items that were not consumed.
    """
    def to_int(v):
        try:
            return int(v or 0)
        except Exception:
            return 0

    def merge_rank(old_rank, old_count, new_rank, new_count):
        # rank: smallest value above 0 (0 means "no rank"); count: larger value.
        valid_ranks = [r for r in (to_int(old_rank), to_int(new_rank)) if r > 0]
        rank = min(valid_ranks) if valid_ranks else 0
        count = max(to_int(old_count), to_int(new_count))
        return rank, count

    def normalized_word(entry):
        return str(entry.get("word", "") or "").strip().lower()

    brand_list = deepcopy(brand_words_rank)
    ai_list = deepcopy(ai_word_list_with_rank)
    consumed = set()

    # ---------- Pass one: exact word match ----------
    for brand_entry in brand_list:
        target = normalized_word(brand_entry)
        if not target:
            continue
        for position, ai_entry in enumerate(ai_list):
            # An empty AI word can never equal the non-empty target.
            if position in consumed or normalized_word(ai_entry) != target:
                continue
            brand_entry["rank"] = ai_entry.get("rank", 0)
            brand_entry["count"] = ai_entry.get("count", 0)
            consumed.add(position)
            break

    # ---------- Pass two: two-way substring match ----------
    for brand_entry in brand_list:
        target = normalized_word(brand_entry)
        if not target:
            continue
        for position, ai_entry in enumerate(ai_list):
            if position in consumed:
                continue
            candidate = normalized_word(ai_entry)
            if not candidate:
                continue
            if target in candidate or candidate in target:
                brand_entry["rank"], brand_entry["count"] = merge_rank(
                    brand_entry.get("rank", 0),
                    brand_entry.get("count", 0),
                    ai_entry.get("rank", 0),
                    ai_entry.get("count", 0),
                )
                consumed.add(position)
                # No break: keep absorbing later AI items that also match.

    remain_ai_list = [entry for position, entry in enumerate(ai_list) if position not in consumed]
    return brand_list, remain_ai_list
def to_int(v):
    """Convert ``v`` to ``int``; falsy or unconvertible values yield 0."""
    try:
        converted = int(v if v else 0)
    except Exception:
        converted = 0
    return converted
def merge_rank(old_rank, old_count, new_rank, new_count):
    """Combine two rank/count pairs after coercing all four values with ``to_int``.

    Returns:
        ``(rank, count)`` where ``rank`` is the smaller of the ranks that are
        above 0 (0 when neither is) and ``count`` is the larger count.
    """
    # rank: only values above 0 are considered valid.
    valid_ranks = [r for r in (to_int(old_rank), to_int(new_rank)) if r > 0]
    rank = min(valid_ranks) if valid_ranks else 0
    # count: keep the larger value.
    count = max(to_int(old_count), to_int(new_count))
    return rank, count
def merge_com_and_ai_by_subset(brand_words_rank, ai_word_list_with_rank):
    """Fold matching AI words into each brand item's ``vos`` and return what is left of the AI list.

    Rules:
    1. Pass one: a ``vos`` word and an AI word that are equal (ignoring case
       and surrounding whitespace) match; the vo's rank / count are overwritten
       with the AI item's, and that vo stops looking.
    2. Pass two: every vo, including those matched in pass one, is compared
       with the remaining AI words using a two-way substring test (ignoring
       case). On a match the values are combined with ``merge_rank`` (smaller
       non-zero rank, larger count). A vo keeps absorbing further AI items.
    3. Each AI item can be consumed at most once.
    4. The outer brand item's own rank / count are left alone; only ``vos``
       entries are updated.

    Both inputs are deep-copied; the originals are not modified.

    Returns:
        ``(brand_list, remain_ai_list)``.
    """
    def normalized_word(entry):
        return str(entry.get("word", "") or "").strip().lower()

    def iter_vos(groups):
        # Yield every vo of every brand item whose "vos" is a list.
        for group in groups:
            vos = group.get("vos", [])
            if isinstance(vos, list):
                yield from vos

    brand_list = deepcopy(brand_words_rank)
    ai_list = deepcopy(ai_word_list_with_rank)
    consumed = set()

    # ---------- Pass one: exact word match ----------
    for vo in iter_vos(brand_list):
        target = normalized_word(vo)
        if not target:
            continue
        for position, ai_entry in enumerate(ai_list):
            # An empty AI word can never equal the non-empty target.
            if position in consumed or normalized_word(ai_entry) != target:
                continue
            vo["rank"] = ai_entry.get("rank", 0)
            vo["count"] = ai_entry.get("count", 0)
            consumed.add(position)
            break

    # ---------- Pass two: two-way substring match ----------
    for vo in iter_vos(brand_list):
        target = normalized_word(vo)
        if not target:
            continue
        for position, ai_entry in enumerate(ai_list):
            if position in consumed:
                continue
            candidate = normalized_word(ai_entry)
            if not candidate:
                continue
            if target in candidate or candidate in target:
                vo["rank"], vo["count"] = merge_rank(
                    vo.get("rank", 0),
                    vo.get("count", 0),
                    ai_entry.get("rank", 0),
                    ai_entry.get("count", 0),
                )
                consumed.add(position)
                # No break: keep absorbing later AI items that also match.

    remain_ai_list = [entry for position, entry in enumerate(ai_list) if position not in consumed]
    return brand_list, remain_ai_list
def rebuild_count_and_rank(data_list):
    """Recompute each item's outer ``count`` and ``rank`` from its ``vos``.

    Rules:
    1. Outer count = sum of all ``count`` values in ``vos``.
    2. Outer rank = smallest ``rank`` in ``vos`` that is greater than 0.
    3. If no vo has a rank greater than 0, the outer rank is 0.

    Non-dict vos are ignored; an item whose ``vos`` is not a list gets count 0
    and rank 0. The input is deep-copied and left unmodified.
    """
    rebuilt = deepcopy(data_list)
    for entry in rebuilt:
        vos = entry.get("vos", [])
        valid_vos = [vo for vo in vos if isinstance(vo, dict)] if isinstance(vos, list) else []
        positive_ranks = [
            rank for rank in ((vo.get("rank", 0) or 0) for vo in valid_vos) if rank > 0
        ]
        entry["count"] = sum((vo.get("count", 0) or 0) for vo in valid_vos)
        entry["rank"] = min(positive_ranks) if positive_ranks else 0
    return rebuilt
def get_all_brands(*data_lists):
    """Collect the distinct brand names from any number of item lists, in first-seen order.

    For each item the ``brand`` is used; when it is missing or the placeholder,
    the item's ``word`` is used instead. Falsy lists and falsy names are skipped.
    """
    # A dict keeps insertion order and rejects duplicates in one structure.
    ordered = {}
    for data_list in data_lists:
        for entry in data_list or ():
            name = entry.get("brand")
            if not name or name == "没有品牌":
                name = entry.get("word")
            if name:
                ordered.setdefault(name, None)
    return list(ordered)
def get_all_words(*data_lists):
    """Collect the ``word`` of every item across any number of item lists.

    Unlike ``get_all_brands`` this does not de-duplicate: the result has one
    entry per item, in input order, including repeated words and ``None`` for
    items without a ``word``. Falsy lists are skipped.

    NOTE(review): the previous version also filled a ``seen`` set that was
    never read; it was removed as dead state. Confirm that keeping duplicates
    is the intended behavior.
    """
    return [
        item.get("word")
        for data_list in data_lists
        if data_list
        for item in data_list
    ]
def parse_mentions(value):
    """Render a mentions value as a single ``", "``-joined string.

    Args:
        value: A list of mentions, a JSON-encoded list, a plain string, or
            anything else.

    Returns:
        * ``""`` for falsy input and for non-list, non-string input;
        * the truthy elements joined with ``", "`` for a list or a JSON list;
        * the string itself for any other string, whether it is not JSON at
          all or is JSON of another type (e.g. ``"123"``).
    """
    if not value:
        return ""
    if isinstance(value, list):
        return ", ".join(str(x) for x in value if x)
    if isinstance(value, str):
        try:
            decoded = json.loads(value)
        except Exception:
            # Not JSON: keep the text as it is.
            return value
        if isinstance(decoded, list):
            return ", ".join(str(x) for x in decoded if x)
        # Valid JSON that is not a list: keep the raw text instead of
        # silently discarding it.
        return value
    return ""
def cache_ai_get_brand_sentiment_and_mentions(prompt, brands, response_content, task_id, platform, req_id):
    """Return sentiment/mention rows for the given brands, using the database as a cache.

    Steps:
    1. De-duplicate ``brands`` by brand name (first occurrence wins).
    2. Load the rows already stored in ``geo_ai_brand_sentiment_and_mentions``
       for ``task_id`` + ``platform`` and those brands, and write one
       ``geo_brand_mention_list`` row per cached row.
    3. If no brand is missing, return the cached rows in input order.
    4. Otherwise call ``ai_get_brand_sentiment_and_mentions`` for the missing
       brands only, store the new rows in the cache table and write their
       ``geo_brand_mention_list`` rows.

    Each row is written to ``geo_brand_mention_list`` once per call; cached
    rows are not written a second time after the AI call.

    Args:
        prompt: Prompt text copied into the mention rows.
        brands: Dicts with ``brand`` and optional ``keyword`` / ``type``;
            non-dict entries and entries without a brand are ignored.
        response_content: Text analysed by the AI helper for missing brands.
        task_id: Task identifier (cache key).
        platform: Platform identifier (cache key).
        req_id: Request id copied into the mention rows.

    Returns:
        Cached rows followed by newly produced AI rows; ``[]`` when there is
        no usable brand.
    """
    if not brands:
        return []

    # brand -> keyword/type, in first-seen order; doubles as the de-duplicated brand list.
    brand_info_map = {}
    for item in brands:
        if not isinstance(item, dict):
            continue
        brand = item.get("brand")
        if brand and brand not in brand_info_map:
            brand_info_map[brand] = {
                "keyword": item.get("keyword", ""),
                "type": item.get("type", ""),
            }
    if not brand_info_map:
        return []
    unique_brands = list(brand_info_map)

    # Escape single quotes for SQL.
    def _escape_sql(s):
        return str(s).replace("'", "''")

    # Build the geo_brand_mention_list rows for a batch of sentiment rows.
    def _mention_rows(rows):
        mention_rows = []
        for row in rows:
            brand_name = row.get("brand_name")
            info = brand_info_map.get(brand_name, {})
            mention_rows.append({
                "req_id": req_id,
                "taskId": task_id,
                "prompt": prompt,
                "platform": platform,
                "keyword": info.get("keyword", ""),
                "type": info.get("type", ""),
                "brand_name": brand_name,
                "positive_mentions": parse_mentions(row.get("positive_mentions")),
                "negative_mentions": parse_mentions(row.get("negative_mentions")),
            })
        return mention_rows

    in_str = ",".join(f"'{_escape_sql(brand)}'" for brand in unique_brands)
    sql = f"""
    select *
    from geo_ai_brand_sentiment_and_mentions
    where taskId = '{_escape_sql(task_id)}'
    and platform = '{_escape_sql(platform)}'
    and brand_name in ({in_str})
    """
    db_result = bh_utils.query_data(sql) or []

    cached_mentions = _mention_rows(db_result)
    if cached_mentions:
        bh_utils.insert_data("geo_brand_mention_list", cached_mentions)

    # Brands already present in the database.
    db_brand_set = {
        item.get("brand_name")
        for item in db_result
        if item.get("brand_name")
    }
    # Brands that still need an AI call.
    missing_brands = [brand for brand in unique_brands if brand not in db_brand_set]

    # Everything is cached: return the rows in input order.
    if not missing_brands:
        db_map = {
            item.get("brand_name"): item
            for item in db_result
            if item.get("brand_name")
        }
        return [db_map[brand] for brand in unique_brands if brand in db_map]

    # Call the AI only for the missing brands.
    ai_result = ai_get_brand_sentiment_and_mentions(missing_brands, response_content) or []

    # Add only the fields the cache table needs (no keyword/type).
    for ar in ai_result:
        ar["taskId"] = task_id
        ar["platform"] = platform

    if ai_result:
        bh_utils.insert_data("geo_ai_brand_sentiment_and_mentions", ai_result)
        # Cached rows were already written to geo_brand_mention_list above, so
        # only the new AI rows are added here.
        bh_utils.insert_data("geo_brand_mention_list", _mention_rows(ai_result))

    return db_result + ai_result
def cache_ai_get_product_sentiment_and_mentions(products, response_content, task_id, platform, req_id):
    """Return sentiment/mention rows for the given products, using the database as a cache.

    Logic:
    1. Look up existing rows for ``task_id`` + ``platform`` + products in
       ``geo_ai_product_sentiment_and_mentions``.
    2. If every product is already present, return the stored rows in input order.
    3. Otherwise call ``ai_get_product_sentiment_and_mentions`` for the missing
       products only and insert the new rows into the cache table.
    4. Write one ``geo_product_mention_list`` row per returned row and return
       the cached rows followed by the new ones.

    Args:
        products: Product names; falsy entries and duplicates are ignored.
        response_content: Text analysed by the AI helper for missing products.
        task_id: Task identifier (cache key).
        platform: Platform identifier (cache key).
        req_id: Request id copied into the mention rows.

    Returns:
        List of sentiment rows; ``[]`` when there is no usable product name.
    """
    if not products:
        return []

    # De-duplicate while keeping the input order.
    unique_brands = list(dict.fromkeys(brand for brand in products if brand))
    if not unique_brands:
        # Only falsy names were given; an empty "in ()" list would be invalid SQL.
        return []

    # Escape single quotes for SQL.
    def _escape_sql(s):
        return str(s).replace("'", "''")

    in_str = ",".join(f"'{_escape_sql(brand)}'" for brand in unique_brands)
    sql = f"""
    select *
    from geo_ai_product_sentiment_and_mentions
    where taskId = '{_escape_sql(task_id)}'
    and platform = '{_escape_sql(platform)}'
    and brand_name in ({in_str})
    """
    db_result = bh_utils.query_data(sql) or []

    # Names already present in the database.
    db_brand_set = {
        item.get("brand_name")
        for item in db_result
        if item.get("brand_name")
    }
    # Names that still need an AI call.
    missing_brands = [brand for brand in unique_brands if brand not in db_brand_set]

    # Everything is cached: return the rows in input order.
    if not missing_brands:
        db_map = {item.get("brand_name"): item for item in db_result if item.get("brand_name")}
        return [db_map[brand] for brand in unique_brands if brand in db_map]

    # Call the AI only for the missing names.
    ai_result = ai_get_product_sentiment_and_mentions(missing_brands, response_content) or []

    # Add the shared key fields.
    for ar in ai_result:
        ar["taskId"] = task_id
        ar["platform"] = platform

    if ai_result:
        bh_utils.insert_data("geo_ai_product_sentiment_and_mentions", ai_result)

    all_result = db_result + ai_result
    result = [
        {
            "req_id": req_id,
            "brand_name": item.get("brand_name"),
            "positive_mentions": parse_mentions(item.get("positive_mentions")),
            "negative_mentions": parse_mentions(item.get("negative_mentions"))
        }
        for item in all_result
    ]
    # Skip the write when there is nothing to store.
    if result:
        bh_utils.insert_data("geo_product_mention_list", result)
    return all_result
def replace_word_by_brand(data_list):
    """Return copies of the items with ``word`` replaced by their real brand name.

    ``word`` is overwritten only when ``brand`` is set and is not the
    placeholder. Non-dict items are dropped; falsy input yields ``[]``. The
    input items are not modified.
    """
    if not data_list:
        return []

    def _swap(entry):
        swapped = entry.copy()
        brand = swapped.get("brand")
        if brand and brand != "没有品牌":
            swapped["word"] = brand
        return swapped

    return [_swap(entry) for entry in data_list if isinstance(entry, dict)]
def keep_min_rank_by_brand(data_list):
    """Keep a single item per ``brand``: the one with the smallest ``rank``.

    On equal ranks the earlier item stays. The result follows the order in
    which each brand first appeared. A missing rank counts as 0.
    """
    best_by_brand = {}
    for entry in data_list or []:
        brand = entry.get("brand")
        rank = entry.get("rank", 0)
        # Skip entries that do not beat the one already kept for this brand.
        if brand in best_by_brand and not rank < best_by_brand[brand].get("rank", 0):
            continue
        best_by_brand[brand] = entry
    return list(best_by_brand.values())
# Maps the Chinese sentiment labels to the upper-case codes used in the output.
sentiment_map = {
    "正向": "POSITIVE",
    "负向": "NEGATIVE",
    "中性": "NEUTRAL"
}


def merge_sentiment_to_brand_words(brand_words_list_with_rank, all_sentiment_and_mentions):
    """Attach a ``sentiment`` code to every brand word that was actually mentioned.

    Args:
        brand_words_list_with_rank: Dicts with ``word`` and ``count``.
        all_sentiment_and_mentions: Dicts with ``brand_name`` and ``sentiment``
            (a key of ``sentiment_map``); later entries for the same brand win.

    Returns:
        Shallow copies of the brand-word dicts. Items whose ``count`` is
        greater than 0 get ``sentiment`` set to the mapped code; a brand
        without a sentiment row, or with a label that ``sentiment_map`` does
        not know (including ``None``), is treated as neutral. Items with a
        zero or missing count are returned without a ``sentiment`` key.
    """
    neutral_label = "中性"

    # brand_name -> sentiment label
    word_sentiment_maps = {}
    for item in all_sentiment_and_mentions or []:
        brand_name = item.get("brand_name")
        if brand_name:
            word_sentiment_maps[brand_name] = item.get("sentiment", neutral_label)

    result = []
    for item in brand_words_list_with_rank:
        new_item = dict(item)
        # "or 0" keeps an explicit None count from breaking the comparison.
        if (new_item.get("count", 0) or 0) > 0:
            label = word_sentiment_maps.get(new_item.get("word"), neutral_label)
            # Unknown labels fall back to neutral instead of raising KeyError.
            new_item["sentiment"] = sentiment_map.get(label, sentiment_map[neutral_label])
        result.append(new_item)
    return result
def build_mentions_list(brand_words_list_with_rank, all_sentiment_and_mentions):
    """Collect the positive and negative mentions of all brand words that were mentioned.

    Args:
        brand_words_list_with_rank: Dicts with ``word`` and ``count``; only
            words with a count above 0 are considered.
        all_sentiment_and_mentions: Dicts with ``brand_name`` and
            ``positive_mentions`` / ``negative_mentions``, each a list or a
            JSON-encoded list. Any other value, including JSON that does not
            decode to a list, is treated as an empty list.

    Returns:
        A one-element list holding
        ``{"negativeMentions": [...], "positiveMentions": [...]}`` with
        stripped, non-empty, de-duplicated mentions in first-seen order.
    """
    def _as_list(raw):
        # Decode JSON text; anything that is not a list in the end counts as empty.
        if isinstance(raw, str):
            try:
                raw = json.loads(raw)
            except Exception:
                return []
        return raw if isinstance(raw, list) else []

    def _clean(values):
        stripped = (str(x).strip() for x in values)
        return [text for text in stripped if text]

    valid_words = set()
    for item in brand_words_list_with_rank or []:
        word = str(item.get("word") or "").strip()
        try:
            count = int(item.get("count") or 0)
        except Exception:
            count = 0
        if word and count > 0:
            valid_words.add(word)

    all_positive = []
    all_negative = []
    for item in all_sentiment_and_mentions or []:
        brand_name = str(item.get("brand_name") or "").strip()
        if brand_name not in valid_words:
            continue
        all_positive.extend(_clean(_as_list(item.get("positive_mentions"))))
        all_negative.extend(_clean(_as_list(item.get("negative_mentions"))))

    return [
        {
            "negativeMentions": list(dict.fromkeys(all_negative)),
            "positiveMentions": list(dict.fromkeys(all_positive))
        }
    ]
def get_first_sentiment(data):
    """Return the first truthy ``sentiment`` among the items, or ``None`` if there is none."""
    sentiments = (entry.get("sentiment") for entry in data or [])
    return next((value for value in sentiments if value), None)
def get_first_score(data):
    """Return the first truthy ``sentimentScore`` among the items, or ``None`` if there is none."""
    scores = (entry.get("sentimentScore") for entry in data or [])
    return next((value for value in scores if value), None)
def mark_source_mentioned(source_list, word_list):
    """Flag, for every source, whether its snippet contains any of the given words.

    Args:
        source_list: Dicts with ``sourceId`` and ``snippet``; a missing or
            ``None`` snippet counts as empty text.
        word_list: Words to look for. Falsy and blank entries are ignored,
            non-string entries are converted with ``str``, and duplicates are
            collapsed.

    Returns:
        One ``{"sourceId": ..., "hasBrand": 0 | 1}`` dict per source.
    """
    # De-duplicate and drop empty values; str() keeps non-string words usable.
    stripped = (str(w).strip() for w in word_list or [] if w)
    words = list(dict.fromkeys(text for text in stripped if text))

    result = []
    for item in source_list or []:
        snippet = item.get("snippet", "") or ""
        mentioned = 1 if any(word in snippet for word in words) else 0
        result.append({"sourceId": item.get("sourceId"), "hasBrand": mentioned})
    return result
def merge_mentioned(snippet_mention, quotes):
    """Copy each source's ``hasBrand`` flag onto the quote with the same ``sourceId``.

    Quotes without a matching entry get ``hasBrand = 0``. *quotes* is updated
    in place and returned.
    """
    flags_by_source = {}
    for marker in snippet_mention:
        flags_by_source[marker["sourceId"]] = marker["hasBrand"]
    for quote in quotes:
        quote["hasBrand"] = flags_by_source.get(quote["sourceId"], 0)
    return quotes
def merge_sentiment(snippet_mention, quotes):
    """Copy each source's ``sentiment`` onto the quote with the same ``sourceId``.

    Quotes without a matching entry get ``"NEUTRAL"``. *quotes* is updated in
    place and returned.
    """
    sentiment_by_source = {}
    for row in snippet_mention:
        sentiment_by_source[row["sourceId"]] = row["sentiment"]
    for quote in quotes:
        quote["sentiment"] = sentiment_by_source.get(quote["sourceId"], "NEUTRAL")
    return quotes
def format_mentions(data_list):
    """Normalise ``positive_mentions`` / ``negative_mentions`` of every row to lists.

    Each value may arrive as a list, as JSON text, or be missing/``None``.
    After this call both keys always hold a list: JSON text is decoded, and
    anything that is not (or does not decode to) a list becomes ``[]``.

    Args:
        data_list: List of dicts, updated in place. ``None`` is accepted.

    Returns:
        The same *data_list* object, or ``[]`` when *data_list* is ``None``.
    """

    def _to_list(raw):
        """Decode JSON text if needed and guarantee a list result."""
        if isinstance(raw, str):
            try:
                raw = json.loads(raw)
            except Exception:
                return []
        return raw if isinstance(raw, list) else []

    if data_list is None:
        return []
    for item in data_list:
        item["positive_mentions"] = _to_list(item.get("positive_mentions"))
        item["negative_mentions"] = _to_list(item.get("negative_mentions"))
    return data_list
def process_brand_sentiment_by_snippets(prompt, platform, taskId, reqId, brand_words, snippet_list):
    """Return the per-source brand sentiment for a request, computing it if needed.

    Rows already stored in ``geo_quote_mention_list`` for *reqId* are returned
    (after ``format_mentions``). Otherwise
    ``ai_get_brand_sentiment_and_mentions_by_articles`` is called with the
    de-duplicated brand words joined by commas, and the resulting rows are
    inserted into ``geo_quote_mention_list`` and returned.

    Args:
        prompt: Prompt text stored with every row.
        platform: Platform code stored with every row.
        taskId: Task id stored with every row.
        reqId: Request id; used as the lookup key.
        brand_words: list[str] of brand words.
        snippet_list: list[dict]; each item carries at least ``snippet``.

    Returns:
        list[dict]; empty when *brand_words* or *snippet_list* is empty.
    """
    results = []
    if not brand_words or not snippet_list:
        return results

    # NOTE(review): reqId is interpolated straight into the SQL text. If
    # bh_utils.query_data supports bound parameters, switch to them.
    db_result = bh_utils.query_data(f"select * from geo_quote_mention_list where reqId = '{reqId}'")
    if db_result:
        return format_mentions(db_result)

    # Fallback: analyse all brand words at once as one comma-joined string.
    fallback_brand_str = [",".join(dict.fromkeys(brand_words))]
    articles_result = ai_get_brand_sentiment_and_mentions_by_articles(fallback_brand_str, snippet_list)
    today = datetime.now().strftime("%Y%m%d")

    for articles in articles_result or []:
        label = articles.get('sentiment') or '中性'
        try:
            sentiment = sentiment_map[label]
        except KeyError:
            # Unknown label from the model: fall back to the neutral entry
            # instead of failing the whole request.
            sentiment = sentiment_map['中性']
        results.append({
            "reqId": reqId,
            "taskId": taskId,
            "platform": platform,
            "prompt": prompt,
            "sourceId": articles.get("sourceId"),
            "sentiment": sentiment,
            "positive_mentions": articles.get('positive_mentions'),
            "negative_mentions": articles.get('negative_mentions'),
            "pt": today
        })

    if results:
        # Skip the insert when there is nothing to write.
        bh_utils.insert_data("geo_quote_mention_list", results)
    return results
def calc_brand_favorability_simple(data_list):
    """Compute a 0-100 favorability score per brand from its mention counts.

    The score is the share of positive mentions among all mentions, as a
    whole-number percentage rounded down. A brand with no mentions scores 50.

    Args:
        data_list: Iterable of dicts with ``brand_name``,
            ``positive_mentions`` and ``negative_mentions`` (sized
            collections; missing or ``None`` counts as empty).

    Returns:
        A list of ``{"brand_name": ..., "favorability_score": int}`` dicts in
        input order.
    """
    result = []
    for item in data_list:
        pos_count = len(item.get("positive_mentions") or [])
        neg_count = len(item.get("negative_mentions") or [])
        total = pos_count + neg_count
        # Integer arithmetic avoids float truncation: int(29 / 100 * 100)
        # evaluates to 28, while 29 * 100 // 100 is 29.
        favorability = pos_count * 100 // total if total > 0 else 50
        result.append({
            "brand_name": item.get("brand_name"),
            "favorability_score": favorability
        })
    return result
def attach_favorability_score(score_list, word_list):
    """Return copies of *word_list* items with ``sentimentScore`` looked up by ``brand``.

    Scores come from *score_list* (``brand_name`` -> ``favorability_score``).
    Items whose ``brand`` has no score get 0. Input items are not modified.
    """
    # Index the scores by brand name first.
    scores_by_brand = {}
    for entry in score_list:
        scores_by_brand[entry.get("brand_name")] = entry.get("favorability_score", 0)
    return [
        {**word, "sentimentScore": scores_by_brand.get(word.get("brand"), 0)}
        for word in word_list
    ]
def attach_favorability_brand_score(score_list, word_list):
    """Return copies of *word_list* items with ``sentimentScore`` looked up by ``word``.

    Same as ``attach_favorability_score`` except that the lookup key is each
    item's ``word`` rather than its ``brand``. Unknown words get 0 and input
    items are not modified.
    """
    # Index the scores by brand name first.
    lookup = {}
    for scored in score_list:
        lookup[scored.get("brand_name")] = scored.get("favorability_score", 0)

    enriched = []
    for original in word_list:
        copy_with_score = dict(original)
        copy_with_score.update({"sentimentScore": lookup.get(original.get("word"), 0)})
        enriched.append(copy_with_score)
    return enriched
def replace_no_brand_inplace(data_list):
    """Replace the placeholder brand "没有品牌" with the row's own ``keyword``.

    Rows are modified in place; the same *data_list* object is returned.
    """
    placeholder_rows = (row for row in data_list if row.get("brand") == "没有品牌")
    for row in placeholder_rows:
        row["brand"] = row.get("keyword")
    return data_list
def add_keyword_type(ai_word_list, brand_words, com_word_list, all_keyword_with_brand):
    """Tag every keyword with the word list it belongs to.

    The first matching list wins, checked in this order:

    - ``brand``: own brand words
    - ``com``: competitor words
    - ``ai``: words detected by the AI
    - ``unknown``: no list contains the keyword

    Returns a new list of ``{"brand", "keyword", "type"}`` dicts in input
    order. ``None`` word lists are treated as empty.
    """
    ai_members = set(ai_word_list or [])
    brand_members = set(brand_words or [])
    com_members = set(com_word_list or [])
    # Precedence of the checks: brand, then competitor, then AI.
    precedence = (
        ("brand", brand_members),
        ("com", com_members),
        ("ai", ai_members),
    )

    tagged = []
    for entry in all_keyword_with_brand:
        keyword = entry.get("keyword")
        word_type = next(
            (label for label, members in precedence if keyword in members),
            "unknown",
        )
        tagged.append({
            "brand": entry.get("brand"),
            "keyword": keyword,
            "type": word_type
        })
    return tagged
def result_v2(response_content, data):
    """Build the full analysis result for one answer and publish it.

    Using helpers defined elsewhere in this file, the function ranks brand,
    competitor, AI-detected, keyword and product words against
    *response_content*, attaches sentiment and favorability scores, merges the
    quote sources read from ``geo/{taskId}/{platform}/quote.txt``, writes the
    assembled result to ``geo/{taskId}/{platform}/result.json``, commits the
    task as ``SUCCESS`` and stores a response stub under the key
    ``geo:task_check:resp:{reqId}``.

    Args:
        response_content: Answer text to analyse.
        data: Task payload. Keys read here: ``reqId``, ``type``, ``keywords``,
            ``productWordsMap``, ``prompt``, ``platform``, ``taskId``,
            ``brandWords``, ``comWords`` and ``comWordsMap``.

    Returns:
        Always ``True``. Exceptions raised by the helpers are not caught here.
    """
    data = process_com_map(data)
    reqId = data.get('reqId')
    type = data.get('type')
    keywords = data.get('keywords')
    product_map = data.get('productWordsMap')
    prompt = data.get('prompt')
    platform = data.get('platform')
    taskId = data.get('taskId')
    brand_words = data.get('brandWords', [])
    comWords = data.get('comWords', [])
    # comWordsMap, when present, takes precedence over the flat comWords list.
    if data.get('comWordsMap'):
        comWords = data.get('comWordsMap')
    com_word_list = extract_keywords(comWords)
    # Collect all words
    # -------------------------
    # Words mentioned by the AI
    ai_word_list = cache_get_ai_brand_list(taskId, platform, response_content, prompt)
    # Union of all word sources (only list-typed sources are included)
    all_word_list = []
    if isinstance(ai_word_list, list):
        all_word_list += ai_word_list
    if isinstance(brand_words, list):
        all_word_list += brand_words
    if isinstance(com_word_list, list):
        all_word_list += com_word_list
    brand_words_list_with_rank = []
    com_word_list_with_rank = []
    ai_word_list_with_rank = []
    keywords_list_with_rank = []
    product_map_list_with_rank = []
    product_map_list_with_rank_result = []
    all_product_list_with_rank = []
    all_word_set_list = list(set(all_word_list))
    ## Drop empty strings (the original note says they cause an endless loop)
    all_word_set_list = [str(x).strip() for x in all_word_set_list if str(x).strip()]
    # Rank of every word: word + rank_list
    all_word_rank_list = get_keyword_ranks(response_content, all_word_set_list)
    # Rank of every word: word + rank + count
    all_keyword_with_rank = convert_rank_data(all_word_rank_list)
    # Brand of every word
    all_keyword_with_brand = keyword_map_brand(all_word_list)
    # Attach the brand to every ranked word
    all_keyword_with_rank_sentiment_brand = keyword_with_brand(all_keyword_with_rank, all_keyword_with_brand)
    # Split into AI words, brand words and competitor words
    ai_word_list_with_rank, brand_words_list_with_rank, com_word_list_with_rank = split_word_rank(
        ai_word_list,
        brand_words,
        com_word_list,
        all_keyword_with_rank_sentiment_brand)
    # --------------------------------- adjust ranks -----------------------------
    brand_words_list_with_rank, ai_word_list_with_rank = merge_brand_rank_by_subset(brand_words_list_with_rank,
                                                                                    ai_word_list_with_rank)
    # Support the map-shaped competitor input
    if data.get('comWordsMap'):
        # Compute the vos and the outer-level data
        com_word_list_with_rank = build_brand_summary(com_word_list_with_rank, comWords)
        # Outer-level brand
        com_word_list_with_rank = fill_outer_brand(com_word_list_with_rank)
        # Remove competitor words that also appear in the AI list
        com_word_list_with_rank, ai_word_list_with_rank = merge_com_and_ai_by_subset(com_word_list_with_rank,
                                                                                     ai_word_list_with_rank)
        # Recompute the outer rank and count
        com_word_list_with_rank = rebuild_count_and_rank(com_word_list_with_rank)
    else:
        # List input: adjust competitor ranks and remove their AI-list entries
        com_word_list_with_rank, ai_word_list_with_rank = merge_brand_rank_by_subset(com_word_list_with_rank,
                                                                                     ai_word_list_with_rank)
    # --------------------------------- adjust ranks -----------------------------
    # Remove from the AI list what is already in the brand/competitor lists
    ai_word_list_with_rank = filter_ai_word_list(ai_word_list_with_rank, brand_words_list_with_rank,
                                                 com_word_list_with_rank)
    # --------------------------------- brand handling -----------------------------
    ai_word_list_with_rank = replace_word_by_brand(ai_word_list_with_rank)
    # Return only one entry per brand
    ai_word_list_with_rank = keep_min_rank_by_brand(ai_word_list_with_rank)
    # --------------------------------- brand handling -----------------------------
    # --------------------------------- sentiment -----------------------------
    # All brands or words
    # NOTE(review): `brands` is not used again in this function.
    brands = get_all_brands(ai_word_list_with_rank) + get_all_words(brand_words_list_with_rank, com_word_list_with_rank)
    # Sentiment plus positive/negative mentions for every word
    all_keyword_with_brand_replace = replace_no_brand_inplace(all_keyword_with_brand)
    brands_with_type = add_keyword_type(ai_word_list, brand_words, com_word_list, all_keyword_with_brand_replace)
    all_sentiment_and_mentions = cache_ai_get_brand_sentiment_and_mentions(prompt, brands_with_type, response_content,
                                                                           taskId, platform, reqId)
    # format_mentions updates the rows in place, so both names below refer to
    # the same normalised rows.
    all_sentiment_and_mentions_list = format_mentions(all_sentiment_and_mentions)
    all_sentiment_and_mentions_score = calc_brand_favorability_simple(all_sentiment_and_mentions_list)
    #
    brand_words_list_with_rank = merge_sentiment_to_brand_words(brand_words_list_with_rank, all_sentiment_and_mentions)
    com_word_list_with_rank = merge_sentiment_to_brand_words(com_word_list_with_rank, all_sentiment_and_mentions)
    ai_word_list_with_rank = merge_sentiment_to_brand_words(ai_word_list_with_rank, all_sentiment_and_mentions)
    # Attach the scores (brand/competitor lists match on `word`, the AI list on `brand`)
    brand_words_list_with_rank = attach_favorability_brand_score(all_sentiment_and_mentions_score,
                                                                 brand_words_list_with_rank)
    com_word_list_with_rank = attach_favorability_brand_score(all_sentiment_and_mentions_score, com_word_list_with_rank)
    ai_word_list_with_rank = attach_favorability_score(all_sentiment_and_mentions_score, ai_word_list_with_rank)
    mentionsList = build_mentions_list(brand_words_list_with_rank, all_sentiment_and_mentions)
    # --------------------------------- sentiment -----------------------------
    # --------------------------------- minimum rank and total count -----------------------------
    min_com_rank, com_count_sum = calculate_rank_min_and_count_sum(com_word_list_with_rank)
    min_brand_rank, brand_count_sum = calculate_rank_min_and_count_sum(brand_words_list_with_rank)
    # --------------------------------- minimum rank and total count -----------------------------
    # --------------------------------- keywords -----------------------------
    if keywords:
        keywords = [str(x).strip() for x in keywords if str(x).strip()]
        keywords_rank_list = get_keyword_ranks(response_content, keywords)
        keywords_list_with_rank = convert_rank_data(keywords_rank_list)
    # --------------------------------- product words -----------------------------
    if product_map:
        # All configured product words
        product_list = extract_keywords(product_map)
        # Product words extracted by the AI
        ai_product_list = cache_get_ai_product_list(taskId, platform, response_content, prompt)
        if isinstance(ai_product_list, list):
            product_list += ai_product_list
        product_list = list(set(product_list))
        product_list = [str(x).strip() for x in product_list if str(x).strip()]
        # Rank of every product
        product_rank_list = get_keyword_ranks(response_content, product_list)
        all_product_list_with_rank = convert_rank_data(product_rank_list)
        # All products
        product_all = get_all_words(all_product_list_with_rank)
        # Sentiment plus positive/negative mentions per product
        all_product_sentiment_and_mentions = cache_ai_get_product_sentiment_and_mentions(product_all, response_content,
                                                                                         taskId,
                                                                                         platform, reqId)
        # Compute all scores
        all_product_sentiment_and_mentions = format_mentions(all_product_sentiment_and_mentions)
        all_product_sentiment_and_mentions_score = calc_brand_favorability_simple(all_product_sentiment_and_mentions)
        # Rank of the configured product words
        product_map_list_with_rank = build_brand_summary(all_product_list_with_rank, product_map)
        # Sentiment of every word
        product_map_list_with_rank = merge_sentiment_to_brand_words(product_map_list_with_rank,
                                                                    all_product_sentiment_and_mentions)
        # Remove from the AI list the words already covered by the configured list
        product_map_list_with_rank, all_product_list_with_rank = merge_com_and_ai_by_subset(product_map_list_with_rank,
                                                                                            all_product_list_with_rank)
        # Scores of the configured product words
        product_map_list_with_rank = attach_favorability_brand_score(all_product_sentiment_and_mentions_score,
                                                                     product_map_list_with_rank)
        product_map_list_with_rank_result = rebuild_count_and_rank(product_map_list_with_rank)
        # Scores of all products
        all_product_list_with_rank = attach_favorability_brand_score(all_product_sentiment_and_mentions_score,
                                                                     all_product_list_with_rank)
        all_product_list_with_rank = merge_sentiment_to_brand_words(all_product_list_with_rank,
                                                                    all_product_sentiment_and_mentions)
    # --------------------------------- quote sources -----------------------------
    path = f'geo/{taskId}/{platform}/quote.txt'
    quto = tos_utils.get_string_from_tos(path)
    snippet_list = []
    quotes = []
    if quto:
        json_quto = json.loads(quto)
        # Public view of each quote (no snippet text).
        for q in json_quto:
            t = {
                "img": q.get("site_icon"),
                "platform": q.get("platform"),
                "sourceId": q.get("quto_id"),
                "title": q.get("title"),
                "url": q.get("url"),
                "site_name": q.get("site_name")
            }
            quotes.append(t)
        # Snippet text per quote, used only for brand detection and sentiment.
        for q in json_quto:
            k = {
                "snippet": q.get("snippet"),
                "sourceId": q.get("quto_id"),
            }
            snippet_list.append(k)
    if quotes:
        prompt = data["prompt"]
        for item in quotes:
            item["prompt"] = prompt
    # Per-source brand flag and sentiment are skipped for tasks of type 'stream'.
    if snippet_list and brand_words and type != 'stream':
        # if snippet_list and brand_words :
        snippet_mention = mark_source_mentioned(snippet_list, brand_words)
        quotes = merge_mentioned(snippet_mention, quotes)
        sentiment_list = process_brand_sentiment_by_snippets(prompt, platform, taskId, reqId, brand_words, snippet_list)
        quotes = merge_sentiment(sentiment_list, quotes)
    # --------------------------------- quote sources -----------------------------
    # --------------------------------- assemble result -----------------------------
    result = {
        "brandVos": brand_words_list_with_rank,
        "competitorVos": com_word_list_with_rank,
        "keywordsVos": keywords_list_with_rank,
        "productVos": product_map_list_with_rank_result,
        "productAllVos": all_product_list_with_rank,
        "allVos": ai_word_list_with_rank,
        "fullVos": [],
        "minBrandRank": min_brand_rank,
        "minCompetitorRank": min_com_rank,
        "platform": platform,
        "prompt": prompt,
        "reqId": reqId,
        "sources": quotes,
        "taskId": taskId,
        "totalBrandCount": brand_count_sum,
        "totalCompetitorCount": com_count_sum,
        "mentionsList": mentionsList
    }
    # Brand sentiment: taken from the first brand word that has a truthy value
    if brand_words_list_with_rank:
        sentiment_result = get_first_sentiment(brand_words_list_with_rank)
        if sentiment_result:
            result['sentiment'] = sentiment_result
        score_result = get_first_score(brand_words_list_with_rank)
        if score_result:
            result['sentimentScore'] = score_result
    # --------------------------------- assemble result -----------------------------
    # --------------------------------- upload result -----------------------------
    tos_utils.put_string_to_tos(f'geo/{taskId}/{platform}/result.json', result)
    commit_task(data, 'SUCCESS')
    resp_cache_key = f"geo:task_check:resp:{reqId}"
    resp = {
        "code": 200,
        "msg": "success",
        "data": {
            "status": 'ING',
            "result": {}
        }
    }
    # NOTE(review): the third argument is presumably an expiry — confirm
    # against _cache_set_json.
    _cache_set_json(resp_cache_key, resp, 1)
    logger.success(f"{data['reqId']}--{data['platform']}--{data['prompt']}--ALL:SUCCESS")
    # --------------------------------- upload result -----------------------------
    return True
class QueueType(Enum):
    """Task queues ordered by priority; each member is ``(priority, queue_name)``."""

    STREAM = (1, "stream")
    STREAM_BATCH = (2, "stream_batch")
    BATCH = (3, "batch")
    SUCCESS = (4, "success")

    def __init__(self, priority, queue_name):
        self.priority = priority
        self.queue_name = queue_name

    @classmethod
    def get_by_name(cls, name):
        """Return the member whose ``queue_name`` equals *name*.

        Raises:
            ValueError: If no queue has that name.
        """
        found = next((member for member in cls if member.queue_name == name), None)
        if found is None:
            raise ValueError(f"不存在名为 {name} 的队列")
        return found

    @classmethod
    def get_next_priority_queue(cls, current_queue):
        """Return the queue that follows *current_queue* in priority order.

        The last queue has no successor and is returned unchanged.
        """
        # Sort explicitly so the result does not rely on declaration order.
        ordered = sorted(cls, key=lambda member: member.priority)
        position = ordered.index(current_queue)
        # Clamp at the final index so the last queue maps to itself.
        return ordered[min(position + 1, len(ordered) - 1)]
def handle_task_failure(task_type: str) -> str:
    """Map a failed task's queue name to the name of the next queue to try.

    :param task_type: current queue name (e.g. "stream", "batch")
    :return: name of the next queue in priority order; the last queue maps
        to itself
    :raises ValueError: if *task_type* is not a known queue name
    """
    try:
        current = QueueType.get_by_name(task_type)
    except ValueError:
        raise ValueError(f"不支持的任务类型: {task_type}")
    return QueueType.get_next_priority_queue(current).queue_name
def platform_process(data):
    """Parse a platform's stored answer and run the analysis for one task.

    The task is committed as ``PROCESSING``. If
    ``geo/{taskId}/{platform}/context.txt`` exists in TOS it is used as the
    answer text; otherwise the platform-specific parser is run on
    ``geo/{taskId}/{platform}/original.text``. A non-empty answer goes to
    ``result_v2``; an empty one (or an unknown platform) sends the task back
    to ``scheduler``.

    Args:
        data: Task payload; ``platform`` and ``taskId`` are required, and
            ``reqId`` and ``prompt`` are used in failure messages.

    Returns:
        ``True`` when processing finished without an exception. ``False``
        after a failure, in which case the error is logged and sent to the
        Feishu bot, the task is committed as ``PROCESS_FAIL`` and handed to
        ``scheduler``.
    """
    # Platform code -> parser of that platform's raw capture. Each parser is
    # expected to return a 6-tuple whose fifth element is the answer text.
    PLATFORM_PROCESS_MAP = {
        'DB': doubao_data_process.doubao_process_original_data,
        'DP': deepseek_data_process.deepseek_process_original_data,
        'TXYB': yuanbao_data_process.yuanbao_process_original_data,
        'KIMI': kimi_data_process.kimi_process_original_data,
        'WXYY': wenxin_data_process.wenxin_process_original_data,
        'TYQW': qianwen_data_process.qianwen_process_original_data,
        'BDAI': baiduai_data_process.baiduai_process_original_data,
        'DPA': deepseek_android_data_process.deepseek_android_process_original_data,
        'DOUBA': doubao_android_data_process.doubao_mobile_process_original_data,
        'DYAI': douyinai_data_process.douyin_ai_process_original_data,
        'TYQWA': qianwen_android_data_process.qianwen_android_process_original_data,
        'TXYBA': yuanbao_android_data_process.yuanbao_android_process_original_data,
        'XHSA': xiaohongshu_android_data_process.xiaohongshu_android_process_original_data,
    }
    try:
        commit_task(data, 'PROCESSING')
        platform = data["platform"]
        original_path = f'geo/{data["taskId"]}/{platform}/original.text'
        context_path = f'geo/{data["taskId"]}/{platform}/context.txt'
        response_content = None
        if check_file_in_tos(context_path):
            response_content = tos_utils.get_string_from_tos(context_path)
        else:
            process_func = PLATFORM_PROCESS_MAP.get(platform)
            if process_func:
                _, _, _, _, response_content, _ = process_func(original_path)
        if response_content:
            result_v2(response_content, data)
        else:
            scheduler(data)
        return True
    except Exception as e:
        # logger.exception already records the traceback.
        logger.exception(
            f"{data['reqId']}--{data['platform']}--{data['prompt']}--PROCESS_FAIL"
        )
        # No extra keyword arguments here: loguru runs str.format() on the
        # message whenever args/kwargs are passed, so a prompt or exception
        # text containing "{" or "}" would raise inside this handler and skip
        # the PROCESS_FAIL commit and the reschedule below.
        logger.error(f"{data['reqId']}--{data['platform']}--{data['prompt']}--{e}--PROCESS_FAIL")
        robot_utils.feishu_tobot(f"{data['reqId']}--{data['platform']}--{data['prompt']}--{e}--PROCESS_FAIL")
        commit_task(data, 'PROCESS_FAIL')
        scheduler(data)
        return False
def main_process(data):
    """Entry point for one task: reuse, fetch or enqueue depending on its state.

    - If ``geo_commit_task`` already has a ``SUCCESS`` row for this task and
      platform, the stored answer is processed again via ``platform_process``.
    - Otherwise a ``stream`` task is fetched immediately through
      ``spider_interface``; a failed fetch, or a capture missing or smaller
      than 1024 bytes, goes to ``scheduler``, anything else to
      ``platform_process``.
    - ``stream_batch`` / ``batch`` tasks are pushed to their queue, but only
      when no row exists yet for the task and platform.

    The ``prompt`` stored back into *data* is stripped of surrounding
    whitespace; the log lines here use the unstripped value.
    """
    task_id = data.get('taskId')
    platform = data.get('platform')
    type = data.get('type')
    req_id = data.get('reqId')
    prompt = data.get('prompt')
    data["prompt"] = prompt.strip()

    # NOTE(review): taskId and platform are interpolated into the SQL text;
    # use bound parameters if bh_utils.query_data supports them.
    check = bh_utils.query_data(
        f"select status from geo_commit_task where taskId ='{task_id}' and platform = '{platform}'")

    already_succeeded = any(row.get("status") == "SUCCESS" for row in (check or []))
    if already_succeeded:
        platform_process(data)
        return

    if type == 'stream':
        logger.success(f"{req_id}--{platform}--{prompt}--{type}")
        fetched = spider_interface.get_platform_response(data)
        if not fetched:
            logger.error(f"{req_id}--{platform}--{prompt}--400")
            scheduler(data)
            return
        size = get_tos_file_size(f"geo/{task_id}/{platform}/original.text")
        if size is None or size < 1024:
            # Missing or too small to be a real capture: reschedule.
            scheduler(data)
            return
        logger.success(f"{req_id}--{platform}--{prompt}--ING_IN_TOS")
        platform_process(data)
        return

    if type in ('stream_batch', 'batch') and not check:
        task_send_queue(data, type)
def process_success(data):
    """Final fallback: get the answer from the AI interface and analyse it.

    Sets ``data['type']`` to ``'success'``, obtains the answer text for the
    prompt (Qianwen interface for ``TYQW``, Doubao interface for every other
    platform), stores it as ``geo/{taskId}/{platform}/context.txt`` and runs
    ``result_v2``. For platform ``DB`` only, the returned quotes are also
    normalised, given numeric ids, inserted into ``geo_quote_result_v2`` and
    stored as ``quote.txt``.

    The dedicated KIMI and DP branches are disabled in the original code, so
    those platforms use the Doubao fallback as well.
    """

    def _identity_of(entry):
        """Key used for id generation: the http(s) URL, else the title."""
        link = entry.get('url')
        if isinstance(link, str) and link.startswith('http'):
            return link
        return entry.get('title')

    data['type'] = 'success'
    platform = data.get('platform')
    if platform == 'DB':
        text, quto = ai_interface.get_doubao_message(data.get('prompt'))
        if quto:
            quto_list = [
                {
                    "url": item.get('url'),
                    "title": item.get('title'),
                    "snippet": item.get('summary'),
                    "index": index,
                    "published_at": item.get('publish_time'),
                    "site_name": item.get('site_name'),
                    "site_icon": item.get('logo_url'),
                }
                for index, item in enumerate(quto)
            ]
            urls = []
            for c in quto_list:
                c['task_id'] = data.get('taskId')
                c['platform'] = platform
                urls.append(_identity_of(c))
            url_ids_map = url_utils.generate_numeric_url_id(urls)
            for c in quto_list:
                c['quto_id'] = url_ids_map.get(_identity_of(c))
            bh_utils.insert_data("geo_quote_result_v2", quto_list)
            put_string_to_tos(f"geo/{data.get('taskId')}/{data.get('platform')}/quote.txt", quto_list)
    elif platform == 'TYQW':
        text = ai_interface.get_qianwewn_message(data.get('prompt'))
    else:
        text, quto = ai_interface.get_doubao_message(data.get('prompt'))
    put_string_to_tos(f"geo/{data.get('taskId')}/{data.get('platform')}/context.txt", text)
    result_v2(text, data)
def process_call_back(task_data, result):
    """Handle a spider callback for a task.

    A payload of ``400`` means the spider failed, so the task is rescheduled.
    Otherwise the payload is stored as ``geo/{taskId}/{platform}/original.text``.
    Processing then continues only if ``geo_commit_task`` has a row for the
    request whose status is not ``SUCCESS``: a capture that is missing or
    smaller than 1024 bytes is rescheduled, anything else goes to
    ``platform_process``.
    """
    task_id = task_data.get('taskId')
    platform = task_data.get('platform')
    req_id = task_data.get('reqId')
    spider_data = result.get('data')

    if spider_data == 400:
        scheduler(task_data)
        return

    logger.success(f"{req_id}--{platform}-------CALL_BACK_START")
    put_string_to_tos(f'geo/{task_id}/{platform}/original.text', spider_data)

    req_check = bh_utils.query_data(f"select status from geo_commit_task where reqId ='{req_id}'")
    req_status = req_check[0].get("status", "") if req_check and len(req_check) > 0 else ""
    if not req_check or req_status == 'SUCCESS':
        # Unknown request, or already finished: nothing more to do.
        return

    size = get_tos_file_size(f"geo/{task_id}/{platform}/original.text")
    if size is None or size < 1024:
        scheduler(task_data)
    else:
        platform_process(task_data)
def scheduler(data):
    """Degrade a failed task to the next queue, or finish it via the fallback.

    The task is committed as ``ING`` and a per task/platform degrade counter
    in Redis is incremented (a 7200 expiry is set when the counter is first
    created). Then:

    - more than 6 degradations: run ``process_success`` directly;
    - a ``batch`` task: re-push to the batch queue on its first retry, and
      run ``process_success`` on later ones;
    - otherwise: move to the next queue from ``handle_task_failure``, where
      the ``success`` queue means running ``process_success``.
    """

    def _incr_with_ttl(counter_key):
        """Increment a Redis counter, setting its expiry on first creation."""
        current = redis_client.incr(counter_key)
        if current == 1:
            redis_client.expire(counter_key, 7200)
        return current

    commit_task(data, 'ING')
    task_id = data.get('taskId')
    platform = data.get('platform')
    req_id = data.get('reqId')
    fail_queue_type = data.get('type')
    prompt = data.get('prompt')

    degrade_count_key = f"geo:degrade_count:{task_id}:{platform}"
    degrade_count = _incr_with_ttl(degrade_count_key)
    if degrade_count > 6:
        logger.success(f"{data['reqId']}--{data['platform']}--{data['prompt']}--降级次数超过6次,直接走success")
        process_success(data)
        return

    if fail_queue_type == 'batch':
        batch_retry_key = f"geo:batch_retry_count:{task_id}:{platform}"
        batch_retry_count = _incr_with_ttl(batch_retry_key)
        if batch_retry_count > 1:
            logger.success(
                f"{req_id}--{platform}--{prompt}--batch已重试{batch_retry_count - 1}次,直接走success"
            )
            process_success(data)
            return
        # First batch failure: give the batch queue one more attempt.
        data['type'] = 'batch'
        key = f"{platform}:geo:batch:list"
        logger.success(
            f"{req_id}--{platform}--{prompt}--batch失败,重新投递batch--第{batch_retry_count}次batch重试"
        )
        redis_client.lpush(key, json.dumps(data))
        return

    next_queue_type = handle_task_failure(fail_queue_type)
    logger.success(
        f"{data['reqId']}--{data['platform']}--{data['prompt']}--降级成功-当前{next_queue_type}--第{degrade_count}次降级")
    if next_queue_type != 'success':
        data['type'] = next_queue_type
        key = f"{platform}:geo:{next_queue_type}:list"
        redis_client.lpush(key, json.dumps(data))
        return
    process_success(data)
def task_send_queue(data, queue):
    """Commit a task as ``ING`` and push it onto ``{platform}:geo:{queue}:list``.

    Truthy ``thinkingEnabled`` / ``searchEnabled`` values are copied to the
    snake_case keys ``thinking_enabled`` / ``search_enabled`` as strings.
    ``search_enabled`` is then always set to ``"1"``, whatever the payload
    asked for.
    """
    logger.success(f"{data['reqId']}--{data['platform']}--{data['prompt']}--任务提交--{queue}")
    for source_key, target_key in (
        ('thinkingEnabled', 'thinking_enabled'),
        ('searchEnabled', 'search_enabled'),
    ):
        flag = data.get(source_key)
        if flag:
            data[target_key] = str(flag)
    # NOTE(review): this overrides the value copied above — confirm that
    # forcing search on is intended.
    data["search_enabled"] = "1"
    commit_task(data, 'ING')
    redis_client.lpush(f"{data['platform']}:geo:{queue}:list", json.dumps(data))
if __name__ == '__main__':
    # Ad-hoc maintenance script: re-submit tasks that are still 'ING' in
    # geo_commit_task (types 'batch' and 'stream_batch') to their queues.
    #
    # Earlier experiments were kept here as commented-out code and are
    # summarised instead of reproduced:
    #   * running platform_process() on one hand-written task payload;
    #   * selecting tasks through get_req_id() for an account and date range,
    #     filtered by platform and type = 'success';
    #   * selecting tasks by an explicit list of reqId values.
    import json
    from concurrent.futures import ThreadPoolExecutor, as_completed

    data_list = bh_utils.query_data("select * from geo_commit_task where status = 'ING' and type in ('batch','stream_batch') ")

    def handle_item(i):
        """Decode the JSON text fields of one task row and re-queue it."""
        print(i.get('reqId'))
        # These fields are decoded with json.loads when present, so they are
        # expected to arrive from the query as JSON text.
        if i.get('comWordsMap'):
            i['comWordsMap'] = json.loads(i.get('comWordsMap'))
        if i.get('brandWords'):
            i['brandWords'] = json.loads(i.get('brandWords'))
        if i.get('comWords'):
            i['comWords'] = json.loads(i.get('comWords'))
        if i.get('keywords'):
            i['keywords'] = json.loads(i.get('keywords'))
        if i.get('productWordsMap'):
            i['productWordsMap'] = json.loads(i.get('productWordsMap'))
        type_t = i.get('type')
        return task_send_queue(i,type_t)

    if data_list:
        # Submit all rows to a pool of up to 30 worker threads.
        with ThreadPoolExecutor(max_workers=30) as executor:
            futures = [executor.submit(handle_item, i) for i in data_list]
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    # NOTE(review): the message names platform_process, but the
                    # work that failed is handle_item / task_send_queue.
                    logger.exception(f"platform_process 执行异常: {e}")
import json
import traceback
from aidso_geo.models import spider_save_tos
from aidso_geo.utils import robot_utils
from aidso_geo.utils.ai_interface import get_parse_sse_result
from aidso_geo.utils.tos_utils import get_string_from_tos
def qianwen_android_process_original_data(file_path):
    """Parse a stored Qianwen (Android) SSE capture into its structured parts.

    The capture is read from TOS and split into lines. For every ``data:``
    line that holds JSON, only the first element of ``data.messages`` is
    examined, and its ``mime_type`` / ``status`` decide what is extracted:
    thinking text, search keywords, cited sources, Taobao product cards,
    Damai show cards, the answer text, video lists and follow-up suggestions.
    The result is saved through ``spider_save_tos`` and returned.

    If anything raises, the function falls back to
    ``get_parse_sse_result(platform, task_id)``, with both values taken from
    the path segments. Without fallback content the answer becomes the
    failure marker string and the path is reported to the Feishu bot.

    Args:
        file_path: TOS key of the capture; the fallback reads the task id
            from segment 1 and the platform from segment 2.

    Returns:
        Tuple ``(file_path, search_keyword, url_list, think_content,
        response_content, suggestions)``.
    """
    url_list = []
    url_list_batch = []
    think_content = ""
    response_content = ""
    search_keyword = []
    rich_media_block = []
    suggestions = []
    # NOTE(review): is_think, think_bool, response_bool and result are
    # assigned but not used anywhere in this function.
    is_think = False
    think_bool = False
    response_bool = False
    result = []
    try:
        original_content = get_string_from_tos(file_path)
        content_list = original_content.split("\n")
        for i in content_list:
            if i.startswith("data:"):
                try:
                    # Extract and parse the JSON payload
                    data_str = i.split("data:")[1]
                    json_data = json.loads(data_str)
                except (IndexError, json.JSONDecodeError):
                    continue
                data = json_data.get('data')
                if data:
                    messages = data.get('messages')
                    if isinstance(messages, list):
                        if len(messages) > 0:
                            mime_type = messages[0].get('mime_type')
                            status = messages[0].get('status')
                            type = messages[0].get('type')
                            content = messages[0].get('content')
                            meta_data = messages[0].get('meta_data')
                            meta_data_type = ""
                            multi_load = ""
                            multi_load_type = ""
                            multi_load_content = ""
                            multi_load_status = ""
                            if meta_data:
                                meta_data_type = messages[0].get('meta_data').get('type')
                                multi_load = messages[0].get('meta_data').get('multi_load')
                                if multi_load:
                                    if isinstance(multi_load, list):
                                        # Type, content and status come from the first entry only.
                                        multi_load_type = multi_load[0].get('type')
                                        multi_load_content = multi_load[0].get('content')
                                        if multi_load_content:
                                            multi_load_status = multi_load[0].get('content').get('status')
                            # Text produced before the search step
                            if mime_type == 'plan_cot/post' and status == 'complete':
                                think_content += content
                            # Search keywords
                            if mime_type == 'bar/progress' and meta_data_type == 'cot':
                                if messages[0].get('meta_data').get('content'):
                                    search_list = messages[0].get('meta_data').get('content').get('list')
                                    if isinstance(search_list, list):
                                        for search in search_list:
                                            search_keyword.append(search.get('query'))
                            # Cited sources without an index
                            if mime_type == 'bar/iframe' and status == 'complete':
                                if meta_data:
                                    sources = meta_data.get('sources')
                                    for sou in sources:
                                        url_list.append({
                                            "url": [sou],
                                            "source_seq": '',
                                        })
                            # Taobao product cards found in this message
                            taobao_list = []
                            if mime_type == 'multi_load/iframe' and multi_load_type == 'taoassistant_fold_product_feeds' and status == 'complete':
                                for mu in multi_load:
                                    if mu.get('type') == 'taoassistant_fold_product_feeds':
                                        source_seq = mu.get('source_seq')
                                        # Link of the first 'popupLink' client action, '' if none
                                        jump_url = next(
                                            (
                                                item.get('params', {}).get('popupLink', '')
                                                for item in
                                                mu.get('content', {}).get('cardData', {}).get('clientActions', [])
                                                if item.get('type') == 'popupLink'
                                            ),
                                            ''
                                        )
                                        # queryList of the first 'loadmore' client action, '' if none
                                        query = next(
                                            (
                                                item.get('params', {}).get('params', {}).get('queryList',[])
                                                for item in
                                                mu.get('content', {}).get('cardData', {}).get('clientActions', [])
                                                if item.get('type') == 'loadmore'
                                            ),
                                            ''
                                        )
                                        # A string value is decoded as JSON before use.
                                        if isinstance(query,str):
                                            query = json.loads(query)
                                        if isinstance(query,list):
                                            search_keyword.extend(query)
                                        if isinstance(mu.get('content').get('cardData').get('data').get('items'), list):
                                            for pro in mu.get('content').get('cardData').get('data').get('items'):
                                                price = (pro.get('priceShowWithIcon') or {}).get('price') or pro.get('itemPrice') or ''
                                                taobao_list.append({
                                                    "title": pro.get('title'),
                                                    "shop_name": next(
                                                        (
                                                            item.get('text', '')
                                                            for item in
                                                            pro.get('structuredShopInfo', {}).get('infoList', [])
                                                            if item.get('sourceType') == 'shop_name'
                                                        ),
                                                        ''
                                                    ),
                                                    "pic_path": pro.get('pic_path', ''),
                                                    "price": price,
                                                    "jump_url": jump_url,
                                                    "source_seq": source_seq,
                                                    "auctionURL": pro.get('auctionURL'),
                                                })
                            if taobao_list:
                                # Embed the cards in the answer text between marker strings.
                                response_content += 'render_ecom_card_widget_taobao_start:'
                                damai_str = json.dumps(taobao_list, ensure_ascii=False)
                                response_content += damai_str
                                response_content += 'render_ecom_card_widget_taobao_end:\n'
                            # Damai show cards found in this message
                            damai_list = []
                            if mime_type == 'multi_load/iframe' and multi_load_type == 'damai_shows_list' and multi_load_status == 'complete':
                                for mu in multi_load:
                                    if mu.get('type') == 'damai_shows_list':
                                        source_seq = mu.get('source_seq')
                                        if isinstance(mu.get('content').get('list'), list):
                                            for damai in mu.get('content').get('list'):
                                                venueName = damai.get('data').get('venueName', '')
                                                cityName = damai.get('data').get('cityName', '')
                                                damai_list.append({
                                                    'name': damai.get('data').get('name', ''),
                                                    'venueName': venueName,
                                                    'showTime': damai.get('data').get('showTime', ''),
                                                    "cityName": cityName,
                                                    'verticalPic': damai.get('data').get('verticalPic', ''),
                                                    'priceShowText': damai.get('data').get('priceShowText', ''),
                                                    'webURL': damai.get('jumpConfig').get('detailUrl').get('webURL',
                                                                                                           '') or '',
                                                    'priceLow': damai.get('data').get('priceLow', ''),
                                                    'minPrice': damai.get('data').get('minPrice', ''),
                                                    'maxPrice': damai.get('data').get('maxPrice', ''),
                                                    'priceStr': damai.get('data').get('priceStr', ''),
                                                    'showVenueName': cityName + venueName,
                                                    'source_seq_id': source_seq
                                                })
                            if damai_list:
                                # Embed the cards in the answer text between marker strings.
                                response_content += 'render_ecom_card_widget_damai_start:'
                                damai_str = json.dumps(damai_list, ensure_ascii=False)
                                response_content += damai_str
                                response_content += 'render_ecom_card_widget_damai_end:\n'
                            if mime_type == 'multi_load/iframe' and multi_load_type == 'deep_think' and multi_load_status == 'complete':
                                think_content += multi_load_content.get('think_content')
                            # Cited sources, answer text and video list
                            if mime_type == 'multi_load/iframe' and status == 'complete':
                                response_content += content
                                for mu in multi_load:
                                    if mu.get('type') == 'video_note_list':
                                        rich_media_block.append(
                                            {
                                                "url": mu.get('content').get('list'),
                                                "source_seq": mu.get('source_seq')
                                            }
                                        )
                                    if mu.get('type') == 'source_group_web':
                                        url_list_batch.append(
                                            {
                                                "url": mu.get('content').get('list'),
                                                "source_seq": mu.get('source_seq')
                                            }
                                        )
                            # Follow-up suggestions
                            if mime_type == 'paa/iframe' and status == 'complete':
                                paas = meta_data.get('paas')
                                if isinstance(paas, list):
                                    for pa in paas:
                                        suggestions.append(pa.get('show_text'))
        # Sources that carry a source_seq replace the index-less ones when present.
        if url_list_batch:
            url_list = url_list_batch
        spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
                                               suggestions, rich_media_block)
        return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
    except Exception as e:
        # traceback.print_exc()
        # Parsing failed: fall back to the AI-based SSE parser.
        parts = file_path.split('/')
        platform = parts[2]
        task_id = parts[1]
        context, quote, suggestion, think, search_word = get_parse_sse_result(platform, task_id)
        if context:
            response_content = context
            url_list = quote
            suggestions = suggestion
            think_content = think
            search_keyword = search_word
            spider_save_tos.process_and_save_files_ai(file_path, search_keyword, url_list, think_content,
                                                      response_content, suggestions)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
        else:
            # No fallback content: store a failure marker and alert the bot.
            response_content = "对话信息获取失败:-200"
            robot_utils.feishu_tobot(file_path)
            spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
                                                   suggestions, rich_media_block)
            return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
if __name__ == '__main__':
    # Manual smoke run against one stored capture; swap in the commented-out
    # path to try another sample.
    file_path2 = 'geo/7e3eb418-83ee-44ca-b6cf-ea4c3af9aae2/TYQWA/original.text'
    # file_path2 = 'geo/ef586bf6-55d8-4e2a-8fb0-5fd1beb5bf7c/TYQWA/original.text'
    qianwen_android_process_original_data(file_path2)
import json
import traceback
from aidso_geo.utils import robot_utils
from aidso_geo.utils.ai_interface import get_parse_sse_result
from aidso_geo.utils.tos_utils import get_string_from_tos
from aidso_geo.models import spider_save_tos
# Parse the raw stream text stored at `file_path`, extract the answer text,
# thinking text, cited sources and follow-up suggestions, persist them through
# spider_save_tos, and return them.
#
# Args:
#     file_path: slash-separated path; the except branch reads parts[1] as the
#         task id and parts[2] as the platform code.
# Returns:
#     Tuple (file_path, search_keyword, url_list, think_content,
#     response_content, suggestions).
#
# Any exception raised while reading or parsing switches to the fallback in the
# except branch (get_parse_sse_result); if that yields no context a failure
# marker is stored instead.
def qianwen_process_original_data(file_path):
url_list = []
url_list_batch = []
think_content = ""
response_content = ""
search_keyword = []
suggestions = []
# NOTE(review): is_think, think_bool, response_bool and result are assigned
# here but never read anywhere in this function.
is_think = False
think_bool = False
response_bool = False
result = []
try:
original_content = get_string_from_tos(file_path)
content_list = original_content.split("\n")
# Only lines starting with "data:" are treated as JSON payloads.
for i in content_list:
if i.startswith("data:"):
try:
# Extract and parse the JSON data
data_str = i.split("data:")[1]
json_data = json.loads(data_str)
except (IndexError, json.JSONDecodeError):
continue
#
if isinstance(json_data, dict):
# Payload shape 1: 'msgStatus' / 'incremental' / 'contents'.
if json_data.get('msgStatus'):
if json_data.get("incremental") == False:
if json_data.get('msgStatus') == 'finished':
if json_data.get('contents'):
for cn in json_data.get('contents'):
if cn.get('contentType') == 'plugin':
# The plugin content is JSON; its 'pluginResult' value is decoded as JSON again.
pluginResult = json.loads(cn.get('content')).get('pluginResult')
if pluginResult:
try:
# A list result: sources are read from element [1]['search_results'].
if isinstance(json.loads(pluginResult), list):
if len(json.loads(pluginResult))>=2:
url_list = json.loads(pluginResult)[1].get(
'search_results')
# A dict result: sources are read from 'links'.
if isinstance(json.loads(pluginResult), dict):
url_list = json.loads(pluginResult).get(
'links')
except Exception as e:
# Any error while decoding pluginResult is swallowed here.
...
if cn.get('contentType') == 'think':
think_content = json.loads(cn.get('content')).get('content')
if cn.get('contentType') == 'text':
response_content = cn.get('content')
# Payload shape 2: a 'data' object carrying 'status' and a 'messages' list.
if json_data.get("data"):
if json_data.get("data").get('status'):
if json_data.get("data").get('status') == 'complete':
messages = json_data.get('data').get('messages')
for cn in messages:
# 'bar/iframe': source list taken from the first entry of meta_data.sources.
if cn.get('mime_type') == 'bar/iframe' and cn.get('status') == 'complete':
url_list_batch =cn.get('meta_data').get('sources')[0].get('content').get('list')
# 'multi_load/iframe': answer text, with the "[(deep_think)]" marker removed.
if cn.get('mime_type') == 'multi_load/iframe' and cn.get('status') == 'complete':
response_content = cn.get('content')
response_content = response_content.replace("[(deep_think)]", "")
multi_load = cn.get('meta_data').get('multi_load')
if multi_load:
for mu in multi_load:
if mu.get('type') == 'deep_think':
if mu.get('content').get('status') =='complete':
think_content = mu.get('content').get('think_content')
# NOTE(review): this repeats the extraction above for payloads that carry
# data.messages and additionally collects 'paa/iframe' suggestions — confirm
# whether it is meant to run independently of data.status.
if json_data.get("data").get('messages'):
messages = json_data.get("data").get('messages')
for ms in messages:
if ms.get('mime_type') =='multi_load/iframe' and ms.get('status') == 'complete':
response_content = ms.get('content')
response_content = response_content.replace("[(deep_think)]", "")
multi_load = ms.get('meta_data').get('multi_load')
if multi_load:
for mu in multi_load:
if mu.get('type') == 'deep_think':
if mu.get('content').get('status') == 'complete':
think_content = mu.get('content').get('think_content')
if ms.get('mime_type') =='bar/iframe' and ms.get('status') == 'complete':
url_list_batch = ms.get('meta_data').get('sources')[0].get('content').get('list')
if ms.get('mime_type') =='paa/iframe' and ms.get('status') == 'complete':
paas = ms.get('meta_data').get('paas')
if paas:
for pa in ms.get('meta_data').get('paas'):
suggestions.append(pa.get('show_text'))
# Convert the 'bar/iframe' source entries into url_list records.
# NOTE(review): the summary is stored under the key "snippet" here.
if url_list_batch:
for url in url_list_batch:
url_list.append(
{
"url": url.get('url', ''),
"title": url.get('title', ''),
"snippet": url.get('summary', ''),
"host_name": url.get('name', ''),
"host_logo": url.get('icon', ''),
"time": url.get('publish_time', ''),
}
)
spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
suggestions)
return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
except Exception as e:
# Fallback: take task id and platform from the path and ask get_parse_sse_result.
parts = file_path.split('/')
platform = parts[2]
task_id = parts[1]
context, quote, suggestion, think, search_word = get_parse_sse_result(platform, task_id)
if context:
response_content = context
url_list = quote
suggestions = suggestion
think_content = think
search_keyword = search_word
spider_save_tos.process_and_save_files_ai(file_path, search_keyword, url_list, think_content,
response_content,suggestions)
return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
else:
# No fallback content: store a failure marker and call robot_utils.feishu_tobot
# (presumably a Feishu bot notification — confirm).
response_content = "对话信息获取失败:-200"
robot_utils.feishu_tobot(file_path)
spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
suggestions)
return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
# Manual entry point: run qianwen_process_original_data on one sample path.
if __name__ == '__main__':
file_path2 = 'geo/a5136478-a267-4824-84a9-08a57e1290ec/TYQW/original.text'
# file_path2 = 'geo/jqk/TYQW/2.txt'
# /geo/900f0ea6e9a34c95b6b57aa7519a4820/
# file_path3 = 'geo/51a7ee04-711c-4cf0-9d4c-4b523fba7037/TYQW/original.text'
qianwen_process_original_data(file_path2)
# for i in task_id:
# plan_cot/post and status =='complete'  -> before the thinking process
# bar/progress and type = cot  -> search keywords
# bar/progress and type = sources  -> cited sources
# multi_load/iframe  -> related videos
# qianwen_process_original_data(f"geo/{i}/TYQW/original.text")
import json
import os
from aidso_geo.utils.tos_utils import put_string_to_tos
from aidso_geo.utils import bh_utils
from aidso_geo.utils import url_utils
def deepseek_process_quote(url_list):
    """Map DeepSeek citation dicts onto the common quote record layout.

    Args:
        url_list: iterable of dicts as found in the parsed DeepSeek stream.

    Returns:
        A new list with one dict per input entry holding the keys ``url``,
        ``title``, ``snippet``, ``index``, ``published_at``, ``site_name`` and
        ``site_icon``. Missing source keys become ``''``.
    """
    # (output key, source key) pairs; only the index is renamed.
    field_map = (
        ("url", "url"),
        ("title", "title"),
        ("snippet", "snippet"),
        ("index", "cite_index"),
        ("published_at", "published_at"),
        ("site_name", "site_name"),
        ("site_icon", "site_icon"),
    )
    return [
        {target: entry.get(source, '') for target, source in field_map}
        for entry in url_list
    ]
def deepseek_android_process_quote(url_list):
    """Map DeepSeek (Android) citation dicts onto the common quote layout.

    Args:
        url_list: iterable of citation dicts.

    Returns:
        List of dicts with ``url``, ``title``, ``snippet``, ``index`` (read
        from ``cite_index``), ``published_at``, ``site_name`` and
        ``site_icon``; absent keys default to ``''``.
    """
    def _to_record(entry):
        # Build one normalized record from a raw citation entry.
        return dict(
            url=entry.get('url', ''),
            title=entry.get('title', ''),
            snippet=entry.get('snippet', ''),
            index=entry.get('cite_index', ''),
            published_at=entry.get('published_at', ''),
            site_name=entry.get('site_name', ''),
            site_icon=entry.get('site_icon', ''),
        )

    return list(map(_to_record, url_list))
def doubao_android_process_quote(url_list):
    """Normalize Doubao (Android) citation dicts into quote records.

    Several fields have a secondary source key that is used when the primary
    one is missing or falsy (for example ``url`` falls back to
    ``main_site_url``).

    Args:
        url_list: iterable of citation dicts.

    Returns:
        List of normalized quote dicts.
    """
    def _pick(entry, primary, secondary):
        # Primary value if truthy, otherwise whatever the secondary key holds.
        return entry.get(primary, '') or entry.get(secondary, '')

    normalized = []
    for entry in url_list:
        record = {}
        record["url"] = _pick(entry, 'url', 'main_site_url')
        record["title"] = _pick(entry, 'title', 'video_captions')
        record["snippet"] = _pick(entry, 'summary', 'video_captions')
        record["index"] = entry.get('index', '')
        record["published_at"] = entry.get('publish_time_second', '')
        record["site_name"] = _pick(entry, 'sitename', 'source_app_name')
        record["site_icon"] = _pick(entry, 'logo_url', 'source_app_icon')
        normalized.append(record)
    return normalized
def doubao_android_process_rich_media(url_list):
    """Normalize Doubao (Android) video entries into rich-media records.

    Fix over the previous version: the cover fallback
    ``cover -> origin_cover -> image_ori -> url`` is now walked defensively.
    Before, an entry without ``video_first_frame_image`` and without a complete
    ``cover`` chain raised ``AttributeError`` even though the expression ended
    in an ``or ''`` default.

    Args:
        url_list: iterable of video dicts.

    Returns:
        List of dicts with ``video_id``, ``video_captions``,
        ``video_first_frame_image``, ``main_site_url`` and ``source_app_name``.
    """
    quto_list = []
    for item in url_list:
        video_id = item.get('item_id', '')

        first_frame = item.get('video_first_frame_image', '')
        if not first_frame:
            # Walk the nested cover structure; stop quietly at any missing level.
            node = item.get('cover')
            for key in ('origin_cover', 'image_ori', 'url'):
                node = node.get(key) if isinstance(node, dict) else None
            first_frame = node or ''

        quto_list.append({
            "video_id": video_id,
            "video_captions": item.get('video_captions', ''),
            "video_first_frame_image": first_frame,
            # Without an explicit page URL, build one from the video id.
            "main_site_url": item.get('main_site_url', '') or f'https://www.douyin.com/video/{video_id}',
            "source_app_name": item.get('source_app_name', '') or '抖音',
        })
    return quto_list
def doubao_process_rich_media(url_list):
    """Normalize Doubao video entries into rich-media records.

    Args:
        url_list: iterable of video dicts.

    Returns:
        List of dicts with ``video_id`` (read from ``item_id``),
        ``video_captions``, ``video_first_frame_image``, ``main_site_url`` and
        ``source_app_name``; absent keys default to ``''``.
    """
    # (output key, source key); only the id is renamed.
    wanted = (
        ("video_id", "item_id"),
        ("video_captions", "video_captions"),
        ("video_first_frame_image", "video_first_frame_image"),
        ("main_site_url", "main_site_url"),
        ("source_app_name", "source_app_name"),
    )
    records = []
    for media in url_list:
        records.append({out_key: media.get(in_key, '') for out_key, in_key in wanted})
    return records
def douyinai_process_rich_media(url_list):
    """Normalize Douyin AI video entries into rich-media records.

    Fixes over the previous version:
    * a missing ``url_struct`` or an empty/missing ``url_list`` no longer
      raises; the first-frame image falls back to ``''``;
    * the share URL is built with string formatting, so a numeric ``id`` no
      longer raises ``TypeError``.

    Args:
        url_list: iterable of video dicts.

    Returns:
        List of dicts with ``video_id``, ``video_captions``,
        ``video_first_frame_image``, ``main_site_url`` and ``source_app_name``.
    """
    quto_list = []
    for item in url_list:
        video_id = item.get('id', '')
        id_text = '' if video_id is None else str(video_id)

        url_struct = item.get('url_struct') or {}
        cover_urls = url_struct.get("url_list") or []
        first_frame = cover_urls[0] if cover_urls else ''

        quto_list.append({
            "video_id": video_id,
            "video_captions": item.get('title', ''),
            "video_first_frame_image": first_frame,
            "main_site_url": "https://m.douyin.com/share/video/" + id_text,
            "source_app_name": item.get('source_app_name', '抖音'),
        })
    return quto_list
def qianwen_android_process_rich_media(url_list):
    """Flatten Qianwen (Android) video groups into rich-media records.

    Each input entry carries a ``source_seq`` and, under ``url``, a list of
    video dicts. Entries whose ``url`` is not a list contribute nothing.

    Args:
        url_list: iterable of group dicts.

    Returns:
        Flat list with one record per video, tagged with the group's
        ``source_seq``.
    """
    records = []
    for group in url_list:
        seq = group.get('source_seq', '')
        videos = group.get('url')
        if not isinstance(videos, list):
            continue
        records.extend(
            {
                "video_id": video.get('zhidaye_id', ''),
                "video_captions": video.get('title', ''),
                "video_first_frame_image": video.get('cover', ''),
                # Prefer the normalized URL, fall back to the raw one.
                "main_site_url": video.get('norm_url', '') or video.get('url', ''),
                "author": video.get('author', ''),
                "publishTime": video.get('publishTime', ''),
                "duration": video.get('duration', ''),
                "source_seq": seq,
            }
            for video in videos
        )
    return records
def baidu_ai_rich_media(url_list):
    """Normalize Baidu AI video entries into rich-media records.

    Fix over the previous version: ``thumbnail`` and ``linkInfo`` are optional
    now. Previously an entry lacking either one raised ``AttributeError``
    (``None.get``). Missing values resolve to ``''``.

    Args:
        url_list: iterable of video dicts.

    Returns:
        List of dicts with ``video_id``, ``video_captions``,
        ``video_first_frame_image``, ``main_site_url`` and ``source_app_name``.
    """
    quto_list = []
    for item in url_list:
        thumbnail = item.get('thumbnail') or {}
        link_info = item.get('linkInfo') or {}
        quto_list.append({
            "video_id": item.get('nid', ''),
            "video_captions": item.get('title', ''),
            # Top-level 'src' wins; otherwise use the thumbnail's 'src'.
            "video_first_frame_image": item.get('src', '') or thumbnail.get('src') or '',
            "main_site_url": link_info.get('originUrl') or link_info.get('href') or '',
            "source_app_name": '全民小视频',
        })
    return quto_list
def doubao_process_quote(url_list):
    """Normalize Doubao citation entries into quote records.

    Every field is read from the entry's ``text_card`` sub-dict.

    Args:
        url_list: iterable of dicts, each holding a ``text_card`` dict.

    Returns:
        List of normalized quote dicts.

    Raises:
        AttributeError: if an entry has no ``text_card`` (``None.get``).
    """
    # (output key, key inside text_card)
    card_fields = (
        ("url", "url"),
        ("title", "title"),
        ("snippet", "summary"),
        ("index", "index"),
        ("published_at", "publish_time_second"),
        ("site_name", "sitename"),
        ("site_icon", "logo_url"),
    )
    records = []
    for entry in url_list:
        card = entry.get('text_card')
        records.append({out_key: card.get(card_key, '') for out_key, card_key in card_fields})
    return records
def baiduai_process_quote(url_list):
    """Normalize Baidu AI citation entries into quote records.

    The record ``index`` is the entry's position in ``url_list``.

    Args:
        url_list: sequence of citation dicts.

    Returns:
        List of normalized quote dicts.
    """
    return [
        {
            "url": entry.get('url', ''),
            "title": entry.get('text', ''),
            "snippet": entry.get('abstract', ''),
            "index": position,
            "published_at": entry.get('published_at', ''),
            "site_name": entry.get('source', ''),
            "site_icon": entry.get('icon', ''),
        }
        for position, entry in enumerate(url_list)
    ]
def douyinai_process_quote(url_list):
    """Normalize Douyin AI citation entries into quote records.

    For entries whose ``site_name`` is ``"抖音视频"`` and that carry an
    ``aweme_id``, the URL is rebuilt as a douyin.com video link; all other
    entries keep their own ``url``. ``index`` is the entry's position.

    Args:
        url_list: sequence of citation dicts.

    Returns:
        List of normalized quote dicts.
    """
    records = []
    for position, entry in enumerate(url_list):
        source_name = entry.get("site_name", "")
        video_id = entry.get("aweme_id", "")
        link = entry.get("url", "")
        if source_name == "抖音视频" and video_id:
            link = f"https://www.douyin.com/video/{video_id}"
        records.append({
            "url": link,
            "title": entry.get('title', ''),
            "snippet": entry.get('abstract', ''),
            "index": position,
            "published_at": entry.get('published_at', ''),
            "site_name": source_name,
            "site_icon": entry.get('logo_url', ''),
        })
    return records
def qianwen_android_process_quote(url_list):
    """Flatten Qianwen (Android) source groups into quote records.

    Each input entry holds a ``source_seq`` and, under ``url``, a list of
    source groups; each group's ``content['list']`` holds the actual
    citations.

    Fix over the previous version: an entry whose ``url`` is not a list (for
    example ``None``) is skipped instead of raising ``TypeError``. This matches
    the ``isinstance(..., list)`` guard already used by
    ``qianwen_android_process_rich_media``.

    Args:
        url_list: sequence of group dicts.

    Returns:
        Flat list of quote dicts; ``index`` is the position of the outer
        entry, and ``source_seq`` comes from the outer entry or, when that is
        falsy, from the group.
    """
    quto_list = []
    for index, item in enumerate(url_list):
        source_seq = item.get('source_seq', '')
        groups = item.get('url')
        if not isinstance(groups, list):
            continue
        for group in groups:
            content = group.get('content')
            if not content:
                continue
            sources = content.get('list')
            if not isinstance(sources, list):
                continue
            for source in sources:
                quto_list.append({
                    "url": source.get('url', ''),
                    "title": source.get('title', ''),
                    "snippet": source.get('summary', ''),
                    "index": index,
                    "published_at": source.get('publish_time', ''),
                    "site_name": source.get('name', ''),
                    "site_icon": source.get('icon', ''),
                    "source_seq": source_seq or group.get('source_seq'),
                })
    return quto_list
def yuanbao_process_quote(url_list):
    """Normalize Yuanbao citation entries into quote records.

    Args:
        url_list: iterable of citation dicts.

    Returns:
        List of dicts with ``url``, ``title``, ``snippet`` (from ``quote``),
        ``index``, ``published_at`` (from ``publish_time``), ``site_name``
        (from ``web_site_name``) and ``site_icon`` (from ``icon_url``).
    """
    records = []
    for doc in url_list:
        read = doc.get
        records.append({
            "url": read('url', ''),
            "title": read('title', ''),
            "snippet": read('quote', ''),
            "index": read('index', ''),
            "published_at": read('publish_time', ''),
            "site_name": read('web_site_name', ''),
            "site_icon": read('icon_url', ''),
        })
    return records
def yuanbao_android_process_quote(url_list):
    """Normalize Yuanbao (Android) citation entries into quote records.

    Args:
        url_list: iterable of citation dicts.

    Returns:
        List of normalized quote dicts; absent source keys become ``''``.
    """
    # output key -> source key
    key_for = {
        "url": 'url',
        "title": 'title',
        "snippet": 'quote',
        "index": 'index',
        "published_at": 'publish_time',
        "site_name": 'web_site_name',
        "site_icon": 'icon_url',
    }
    return [
        {out_key: doc.get(src_key, '') for out_key, src_key in key_for.items()}
        for doc in url_list
    ]
def xiaohongshu_android_process_quote(url_list):
    """Normalize Xiaohongshu (Android) note entries into quote records.

    Changes over the previous version: the local no longer shadows the builtin
    ``id``, and the note URL is built with string formatting, so a non-string
    id (for example an int) no longer raises ``TypeError``.

    Args:
        url_list: sequence of note dicts.

    Returns:
        List of quote dicts. ``url`` is the explore link for the note id, or
        ``''`` when the entry has no (truthy) ``id``; ``index`` is the entry's
        position.
    """
    quto_list = []
    for index, item in enumerate(url_list):
        note_id = item.get('id')
        url = f"https://www.xiaohongshu.com/explore/{note_id}" if note_id else ""
        quto_list.append({
            "url": url,
            "title": item.get('title', ''),
            "snippet": item.get('content', ''),
            "index": index,
            "published_at": item.get('time', ''),
            "site_name": '小红书',
            "site_icon": 'www.xiaohongshu.com',
        })
    return quto_list
def qianwen_process_quote(url_list):
    """Normalize Qianwen citation entries into quote records.

    Fix over the previous version: the snippet is read from ``body`` and, when
    that is missing or empty, from ``snippet``. The source records that
    ``qianwen_process_original_data`` builds from ``bar/iframe`` messages store
    their summary under ``snippet``, so those summaries were previously
    dropped.

    Args:
        url_list: sequence of citation dicts.

    Returns:
        List of quote dicts; ``index`` is the entry's position.
    """
    quto_list = []
    for index, item in enumerate(url_list):
        quto_list.append({
            "url": item.get('url', ''),
            "title": item.get('title', ''),
            "snippet": item.get('body', '') or item.get('snippet', ''),
            "index": index,
            "published_at": item.get('time', ''),
            "site_name": item.get('host_name', ''),
            "site_icon": item.get('host_logo', ''),
        })
    return quto_list
def kimi_process_quote(url_list):
    """Normalize Kimi citation entries into quote records.

    ``title`` falls back to the entry's ``site_name`` key when empty, and
    ``index`` is ``refIndex`` if truthy, else ``id``, else the entry's
    position.

    NOTE(review): the title fallback reads ``site_name`` while the
    ``site_name`` output field is read from ``siteName`` — confirm which key
    the upstream payload really uses.

    Args:
        url_list: sequence of citation dicts.

    Returns:
        List of normalized quote dicts.
    """
    records = []
    for position, ref in enumerate(url_list):
        heading = ref.get('title', '') or ref.get('site_name', '')
        ref_number = ref.get('refIndex') or ref.get('id', position)
        record = {
            "url": ref.get('url', ''),
            "title": heading,
            "snippet": ref.get('snippet', ''),
            "index": ref_number,
            "published_at": ref.get('publishTime', ''),
            "site_name": ref.get('siteName', ''),
            "site_icon": ref.get('iconurl', ''),
        }
        records.append(record)
    return records
def wenxin_process_quote(url_list):
    """Normalize Wenxin citation entries into quote records.

    Most fields accept two alternative source keys; the second one is used
    when the first is missing or falsy. ``title`` defaults to ``'PDF'`` when
    the key is absent, and ``index`` falls back to the entry's position.

    Args:
        url_list: sequence of citation dicts.

    Returns:
        List of normalized quote dicts.
    """
    def _either(entry, first, second):
        # Value under `first` if truthy, otherwise whatever `second` holds.
        return entry.get(first, '') or entry.get(second, '')

    return [
        {
            "url": entry.get('url', ''),
            "title": entry.get('title', 'PDF'),
            "snippet": _either(entry, 'siteAbstract', 'wild_abstract'),
            "index": entry.get('index', '') or position,
            "published_at": _either(entry, 'publishTime', 'date'),
            "site_name": _either(entry, 'name', 'site'),
            "site_icon": _either(entry, 'icon', 'site_icon'),
        }
        for position, entry in enumerate(url_list)
    ]
def get_target_dir(full_path):
    """Return the directory part of a storage key, with a trailing ``/``.

    The result is later split on ``'/'`` (see ``save_data_to_tos``) and used as
    a key prefix, so the separator must always be a forward slash. The
    previous implementation appended ``os.sep``, which yields a backslash on
    Windows; using ``posixpath`` keeps the result identical on POSIX hosts and
    correct everywhere else.

    Args:
        full_path: slash-separated key such as ``'geo/<task>/<platform>/original.text'``.

    Returns:
        The key's parent prefix ending in ``'/'``.
    """
    import posixpath  # local import: keeps this block self-contained

    return posixpath.dirname(full_path) + '/'
def save_data_to_tos(target_dir, content, file_name):
    """Normalize one result artefact and store it under ``target_dir``.

    Nothing happens for falsy ``content``. ``rich_media_block.txt`` and
    ``quote.txt`` are first converted by the platform-specific normalizer;
    quote records are additionally tagged with task id, platform and a
    ``quto_id`` and inserted into ``geo_quote_result_v2``. Finally the content
    is written to ``<target_dir><file_name>``.

    Args:
        target_dir: slash-separated prefix; element [1] is read as the task id
            and element [2] as the platform code.
        content: payload for the file (records for quote / rich media files).
        file_name: name of the artefact, e.g. ``"quote.txt"``.
    """
    if not content:
        return

    path_parts = target_dir.split('/')
    platform = path_parts[2]

    if file_name == "rich_media_block.txt":
        # Platforms without an entry keep their content untouched.
        rich_media_handlers = {
            'DOUBA': doubao_android_process_rich_media,
            'DB': doubao_process_rich_media,
            'DYAI': douyinai_process_rich_media,
            'TYQWA': qianwen_android_process_rich_media,
            'BDAI': baidu_ai_rich_media,
        }
        handler = rich_media_handlers.get(platform)
        if handler is not None:
            content = handler(content)

    if file_name == "quote.txt":
        quote_handlers = {
            'DB': doubao_process_quote,
            'DP': deepseek_process_quote,
            'TXYB': yuanbao_process_quote,
            'TYQW': qianwen_process_quote,
            'KIMI': kimi_process_quote,
            'WXYY': wenxin_process_quote,
            'BDAI': baiduai_process_quote,
            'DPA': deepseek_android_process_quote,
            'DOUBA': doubao_android_process_quote,
            'DYAI': douyinai_process_quote,
            'TYQWA': qianwen_android_process_quote,
            'TXYBA': yuanbao_android_process_quote,
            'XHSA': xiaohongshu_android_process_quote,
        }
        # Unknown platforms use the Doubao normalizer.
        content = quote_handlers.get(platform, doubao_process_quote)(content)
        task_id = path_parts[1]

        def _lookup_key(record):
            # An http(s) URL identifies the quote; otherwise use its title.
            link = record.get('url')
            if isinstance(link, str) and link.startswith('http'):
                return link
            return record.get('title')

        lookup_keys = []
        for record in content:
            record['task_id'] = task_id
            record['platform'] = platform
            lookup_keys.append(_lookup_key(record))
        url_ids_map = url_utils.generate_numeric_url_id(lookup_keys)
        for record in content:
            record['quto_id'] = url_ids_map.get(_lookup_key(record))
        bh_utils.insert_data("geo_quote_result_v2", content)

    put_string_to_tos(f"{target_dir}{file_name}", content)
def process_and_save_files(file_path, search_keyword, url_list, think_content, response_content, suggestions,rich_media_block=None):
    """Store every extracted artefact next to ``file_path``.

    Each value is handed to ``save_data_to_tos`` together with its file name,
    in this order: search keywords, quotes, thinking text, answer text,
    suggestions and, only when ``rich_media_block`` is not ``None``, the rich
    media block.

    Args:
        file_path: path of the original file; its directory is the target.
        search_keyword: written to ``search_keyword.txt``.
        url_list: written to ``quote.txt``.
        think_content: written to ``think.txt``.
        response_content: written to ``context.txt``.
        suggestions: written to ``suggestions.txt``.
        rich_media_block: optional, written to ``rich_media_block.txt``.
    """
    target_dir = get_target_dir(file_path)
    # file name -> payload; dict order is the write order.
    outputs = {
        "search_keyword.txt": search_keyword,
        "quote.txt": url_list,
        "think.txt": think_content,
        "context.txt": response_content,
        "suggestions.txt": suggestions,
    }
    if rich_media_block is not None:
        outputs["rich_media_block.txt"] = rich_media_block
    for file_name, payload in outputs.items():
        save_data_to_tos(target_dir, payload, file_name)
def ai_process_quote(url_list):
    """Normalize quotes returned by the AI parse fallback into quote records.

    Args:
        url_list: sequence of quote dicts.

    Returns:
        List of dicts with ``url``, ``title``, ``snippet``, ``index``,
        ``published_at``, ``site_name`` and ``site_icon``. A missing or falsy
        ``index`` is replaced by the entry's position.
    """
    plain_fields = ('url', 'title', 'snippet')
    trailing_fields = ('published_at', 'site_name', 'site_icon')
    records = []
    for position, quote in enumerate(url_list):
        record = {name: quote.get(name, '') for name in plain_fields}
        record["index"] = quote.get('index', '') or position
        for name in trailing_fields:
            record[name] = quote.get(name, '')
        records.append(record)
    return records
def save_data_to_tos_ai(target_dir, content, file_name):
    """Store one artefact produced by the AI parse fallback.

    Falsy ``content`` is ignored. For ``quote.txt`` the records are normalized
    with ``ai_process_quote``, tagged with task id, platform and ``quto_id``,
    and inserted into ``geo_quote_result_v2``. The content is then written to
    ``<target_dir><file_name>``.

    Args:
        target_dir: slash-separated prefix; element [1] is read as the task id
            and element [2] as the platform code.
        content: payload for the file.
        file_name: name of the artefact.
    """
    if not content:
        return

    segments = target_dir.split('/')
    platform = segments[2]

    if file_name == "quote.txt":
        content = ai_process_quote(content)
        task_id = segments[1]

        keys = []
        for row in content:
            row['task_id'] = task_id
            row['platform'] = platform
            link = row.get('url')
            # An http(s) URL identifies the quote; otherwise use its title.
            has_link = isinstance(link, str) and link.startswith('http')
            keys.append(link if has_link else row.get('title'))

        url_ids_map = url_utils.generate_numeric_url_id(keys)
        for row, key in zip(content, keys):
            row['quto_id'] = url_ids_map.get(key)
        bh_utils.insert_data("geo_quote_result_v2", content)

    put_string_to_tos(f"{target_dir}{file_name}", content)
def process_and_save_files_ai(file_path, search_keyword, url_list, think_content, response_content, suggestions,rich_media_block=None):
    """Store the artefacts obtained from the AI parse fallback.

    Each value is handed to ``save_data_to_tos_ai`` with its file name, in the
    order search keywords, quotes, thinking text, answer text, suggestions.

    Args:
        file_path: path of the original file; its directory is the target.
        search_keyword: written to ``search_keyword.txt``.
        url_list: written to ``quote.txt``.
        think_content: written to ``think.txt``.
        response_content: written to ``context.txt``.
        suggestions: written to ``suggestions.txt``.
        rich_media_block: accepted but not used by this function.
    """
    target_dir = get_target_dir(file_path)
    file_names = ("search_keyword.txt", "quote.txt", "think.txt", "context.txt", "suggestions.txt")
    payloads = (search_keyword, url_list, think_content, response_content, suggestions)
    for payload, file_name in zip(payloads, file_names):
        save_data_to_tos_ai(target_dir, payload, file_name)
import json
import traceback
from aidso_geo.utils import robot_utils
from aidso_geo.utils.ai_interface import get_parse_sse_result
from aidso_geo.utils.tos_utils import get_string_from_tos
from aidso_geo.models import spider_save_tos
# Parse the raw Wenxin stream text stored at `file_path`, extract thinking text,
# answer text, cited sources and suggestions, persist them through
# spider_save_tos, and return them.
#
# Args:
#     file_path: slash-separated path; the except branch reads parts[1] as the
#         task id and parts[2] as the platform code.
# Returns:
#     Tuple (file_path, search_keyword, url_list, think_content,
#     response_content, suggestions).
#
# Any exception during reading/parsing switches to the get_parse_sse_result
# fallback in the except branch.
def wenxin_process_original_data(file_path):
# NOTE(review): url_list starts as an empty string here and is only replaced
# when a 'step' or 'state' record supplies a value.
url_list = ""
think_content = ""
response_content = ""
search_keyword = []
suggestions = []
# NOTE(review): is_think, think_bool and response_bool are never read in this function.
is_think = False
think_bool = False
response_bool = False
try:
original_content = get_string_from_tos(file_path)
# Records are separated by blank lines; line 0 is read as "event:<name>" and
# line 1 as "data:<json>".
content_list = original_content.split("\n\n")
for i in content_list:
data = i.split("\n")
event =data[0].split('event:')
if len(event)>=2:
# 'thought': accumulate the thinking text.
if event[1]=='thought':
think_content+=json.loads(data[1].split('data:')[1]).get('thoughts')
# 'step': when the raw data text mentions 'url', take its 'contents' as sources.
if event[1]=='step' :
if 'url' in data[1].split('data:')[1]:
url_list = json.loads(data[1].split('data:')[1]).get('contents')
# 'state': on task_status SUCCESS, sources come from searchCitations.list.
if event[1] == 'state':
if json.loads(data[1].split('data:')[1]).get('task_status') == 'SUCCESS':
if json.loads(data[1].split('data:')[1]).get('searchCitations'):
url_list = json.loads(data[1].split('data:')[1]).get('searchCitations').get("list")
# 'message': accumulate the answer text from data.content.
if event[1]=='message':
if data[1].split('data:')[1]:
if json.loads(data[1].split('data:')[1]).get('data').get('content'):
response_content+=json.loads(data[1].split('data:')[1]).get('data').get('content')
# 'quesRecommend': collect each item's guideQuery as a suggestion.
# NOTE(review): the inner loop reuses the name `i` of the outer loop variable.
if event[1] == 'quesRecommend':
for i in json.loads(data[1].split('data:')[1]).get('data'):
suggestions.append(i.get('guideQuery'))
spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
suggestions)
return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
except Exception as e:
# Fallback: take task id and platform from the path and ask get_parse_sse_result.
parts = file_path.split('/')
platform = parts[2]
task_id = parts[1]
context, quote, suggestion, think, search_word = get_parse_sse_result(platform, task_id)
if context:
response_content = context
url_list = quote
suggestions = suggestion
think_content = think
search_keyword = search_word
spider_save_tos.process_and_save_files_ai(file_path, search_keyword, url_list, think_content,
response_content,suggestions)
return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
else:
# No fallback content: store a failure marker and call robot_utils.feishu_tobot
# (presumably a Feishu bot notification — confirm).
response_content = "对话信息获取失败:-200"
robot_utils.feishu_tobot(file_path)
spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
suggestions)
return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
# Manual entry point: run wenxin_process_original_data on one sample path.
if __name__ == '__main__':
file_path = 'geo/62069ec4-c957-46b6-940a-230ad5b78a9b/WXYY/original.text'
wenxin_process_original_data(file_path)
import json
import traceback
from aidso_geo.models import spider_save_tos
from aidso_geo.utils import robot_utils
from aidso_geo.utils.ai_interface import get_parse_sse_result
from aidso_geo.utils.tos_utils import get_string_from_tos
def xiaohongshu_android_process_original_data(file_path):
    """Parse the Xiaohongshu (Android) result JSON and persist its parts.

    The stored document is loaded as one JSON object. ``base_info.text``
    becomes the answer text (with any ``</complex_list>`` marker removed) and
    ``items`` becomes the quote list. Thinking text, search keywords and
    suggestions stay empty for this platform.

    Clean-up over the previous version: locals that were assigned but never
    read, the unused exception binding and a block of commented-out code were
    removed. Runtime behaviour is unchanged.

    Args:
        file_path: path of the original file handed to ``get_string_from_tos``.

    Returns:
        Tuple ``(file_path, search_keyword, url_list, think_content,
        response_content, suggestions)``.
    """
    url_list = []
    think_content = ""
    response_content = ""
    search_keyword = []
    suggestions = []
    try:
        original_content = get_string_from_tos(file_path)
        json_content = json.loads(original_content)
        base_info = json_content.get('base_info')
        items = json_content.get('items')
        response_content = base_info.get('text')
        url_list = items
        if "</complex_list>" in response_content:
            response_content = response_content.replace('</complex_list>', '')
        spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
                                               suggestions)
        return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
    except Exception:
        # NOTE: no get_parse_sse_result fallback is attempted for this
        # platform; any failure stores the failure marker and calls
        # robot_utils.feishu_tobot.
        response_content = "对话信息获取失败:-200"
        robot_utils.feishu_tobot(file_path)
        spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
                                               suggestions)
        return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
# Manual entry point: run xiaohongshu_android_process_original_data on one sample path.
if __name__ == '__main__':
file_path2 = 'geo/XHSA-XHSA-01/XHSA/original.text'
xiaohongshu_android_process_original_data(file_path2)
import json
from aidso_geo.models import spider_save_tos
from aidso_geo.utils import robot_utils
from aidso_geo.utils.ai_interface import get_parse_sse_result
from aidso_geo.utils.tos_utils import get_string_from_tos
# Returns the tuple (file_path, search_keyword, url_list, think_content,
# response_content, suggestions). Any exception during reading/parsing switches
# to the get_parse_sse_result fallback in the except branch, which reads
# parts[1] of file_path as the task id and parts[2] as the platform code.
def yuanbao_android_process_original_data(file_path):
"""Process the file content and extract the thinking content, response content and URL list."""
url_list =[]
think_content = ""
response_content = ""
# Accumulates only the 'text' messages; used below to decide whether
# deepSearch text should also be appended to the answer.
response_process = ""
search_keyword = []
# NOTE(review): is_think, think_bool and response_bool are never read in this function.
is_think = False
think_bool = False
suggestions = []
response_bool = False
try:
original_content = get_string_from_tos(file_path)
# # Split the content on blank lines and filter out empty strings
content_list = original_content.split("\n\n")
for i in content_list:
if i.startswith("data: "):
try:
# Extract and parse the JSON data
data_str = i.split("data: ")[1]
json_data = json.loads(data_str)
except (IndexError, json.JSONDecodeError):
continue # skip malformed data
# 'searchGuid': the cited documents.
if json_data.get('type') == 'searchGuid':
url_list = json_data.get('docs')
# 'think': accumulate thinking text.
if json_data.get('type') == 'think':
think_content += json_data.get('content')
# 'text': accumulate answer text.
if json_data.get('type') == 'text' and json_data.get('msg') is not None:
response_content += json_data.get('msg')
response_process += json_data.get('msg')
# 'deepSearch': the first content's msg is appended to the thinking text and,
# while no 'text' message has been seen, also to the answer; its docs extend
# the source list.
if json_data.get('type') == 'deepSearch':
if json_data.get('contents'):
if json_data.get('contents')[0].get('msg'):
think_content += json_data.get('contents')[0].get('msg')
if not response_process:
response_content += json_data.get('contents')[0].get('msg')
# NOTE(review): url_list may have been replaced by the 'docs' value of a
# 'searchGuid' record above; confirm it is always a list before extend().
if json_data.get('contents')[0].get('docs'):
url_list.extend(json_data.get('contents')[0].get('docs'))
# 'image': the answer is replaced by a fixed marker string.
if json_data.get('type') == 'image':
response_content = "生成了图片"
spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
suggestions)
return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
except Exception as e:
# Fallback: take task id and platform from the path and ask get_parse_sse_result.
parts = file_path.split('/')
platform = parts[2]
task_id = parts[1]
context, quote, suggestion, think, search_word = get_parse_sse_result(platform, task_id)
if context:
response_content = context
url_list = quote
suggestions = suggestion
think_content = think
search_keyword = search_word
spider_save_tos.process_and_save_files_ai(file_path, search_keyword, url_list, think_content,
response_content,suggestions)
return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
else:
# No fallback content: store a failure marker and call robot_utils.feishu_tobot
# (presumably a Feishu bot notification — confirm).
response_content = "对话信息获取失败:-200"
robot_utils.feishu_tobot(file_path)
spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
suggestions)
return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
# Manual entry point: run yuanbao_android_process_original_data on one sample path.
if __name__ == '__main__':
file_path2 = 'geo/5582d220-f032-4095-ad39-1dbc33a536c3/TXYBA/original.text'
yuanbao_android_process_original_data(file_path2)
import json
from aidso_geo.utils import robot_utils
from aidso_geo.utils.ai_interface import get_parse_sse_result
from aidso_geo.utils.tos_utils import get_string_from_tos
from aidso_geo.models import spider_save_tos
# Parse the raw Yuanbao stream text stored at `file_path`, extract thinking
# text, answer text and cited documents, persist them through spider_save_tos,
# and return them.
#
# Args:
#     file_path: slash-separated path; the except branch reads parts[1] as the
#         task id and parts[2] as the platform code.
# Returns:
#     Tuple (file_path, search_keyword, url_list, think_content,
#     response_content, suggestions).
#
# Any exception during reading/parsing switches to the get_parse_sse_result
# fallback in the except branch.
def yuanbao_process_original_data(file_path):
url_list = []
think_content = ""
# NOTE(review): response_process is never reassigned in this function, so the
# `if not response_process` test below is always true.
response_process = ""
response_content = ""
search_keyword = []
suggestions = []
# NOTE(review): is_think, think_bool, response_bool and rich_media_block are
# never read in this function.
is_think = False
think_bool = False
response_bool = False
rich_media_block =[]
try:
original_content = get_string_from_tos(file_path)
# Records are separated by blank lines; only those starting with "data: " are parsed.
content_list = original_content.split("\n\n")
for i in content_list:
if i.startswith("data: "):
try:
# Extract and parse the JSON data
data_str = i.split("data: ")[1]
json_data = json.loads(data_str)
except (IndexError, json.JSONDecodeError):
continue # skip malformed data
# 'searchGuid': the cited documents.
if json_data.get('type') == 'searchGuid':
url_list = json_data.get('docs')
# 'think': accumulate thinking text.
if json_data.get('type') == 'think':
think_content+=json_data.get('content')
# 'text': accumulate answer text.
if json_data.get('type') == 'text' and json_data.get('msg') is not None:
response_content+=json_data.get('msg')
# 'deepSearch': the first content's msg is appended to the thinking text and to the answer.
if json_data.get('type') == 'deepSearch':
if json_data.get('contents'):
if json_data.get('contents')[0].get('msg'):
think_content += json_data.get('contents')[0].get('msg')
if not response_process:
response_content += json_data.get('contents')[0].get('msg')
# 'image': the answer is replaced by a fixed marker string.
if json_data.get('type') == 'image':
response_content = "生成了图片"
spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
suggestions)
return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
except Exception as e:
# Fallback: take task id and platform from the path and ask get_parse_sse_result.
parts = file_path.split('/')
platform = parts[2]
task_id = parts[1]
context, quote, suggestion, think, search_word = get_parse_sse_result(platform, task_id)
if context:
response_content = context
url_list = quote
suggestions = suggestion
think_content = think
search_keyword = search_word
spider_save_tos.process_and_save_files_ai(file_path, search_keyword, url_list, think_content,
response_content,suggestions)
return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
else:
# No fallback content: store a failure marker and call robot_utils.feishu_tobot
# (presumably a Feishu bot notification — confirm).
response_content = "对话信息获取失败:-200"
robot_utils.feishu_tobot(file_path)
spider_save_tos.process_and_save_files(file_path, search_keyword, url_list, think_content, response_content,
suggestions)
return (file_path, search_keyword, url_list, think_content, response_content, suggestions)
# Manual entry point: run yuanbao_process_original_data on one sample path.
if __name__ == '__main__':
# https://tcdn.aidso.com/geo/c7581554-435b-47ca-a2e5-d52bea763b8a/DOUBA/search_keyword.txt?secret=11049cbfb1c7018597a2085455505304520f0507080c0c4b514e09
file_path = 'geo/5582d220-f032-4095-ad39-1dbc33a536c3/TXYB/original.text'
yuanbao_process_original_data(file_path)
# -*- coding: utf-8 -*-
import requests
import json
import time
import requests
from aidso_geo.utils.robot_utils import feishu_tobot_sse
def get_doubao_message(prop):
    """Send ``prop`` to the Doubao responses endpoint with web search enabled.

    Fix over the previous version: a response without an ``output`` list, or a
    message without ``content``, no longer raises ``TypeError``; the function
    then returns the initial defaults ``("", "")``.

    Args:
        prop: the user prompt text.

    Returns:
        Tuple ``(text, annotations)`` taken from the last content part of the
        last ``message`` item in the response's ``output``.
    """
    url = "https://ark.cn-beijing.volces.com/api/v3/responses"
    payload = json.dumps({
        "model": "doubao-seed-1-6-251015",
        "stream": False,
        "tools": [
            {
                "type": "web_search",
                "max_keyword": 3
            }
        ],
        "input": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_text",
                        "text": prop
                    }
                ]
            }
        ],
        "thinking": {
            "type": "disabled"
        }
    })
    # NOTE(review): the API key is hard-coded in source; it should be loaded
    # from configuration or a secret store.
    headers = {
        'Authorization': 'Bearer fcc424e5-58af-494d-9683-5787413a26c9',
        'Content-Type': 'application/json'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    response_json = response.json()
    text = ""
    quto = ""
    for output_item in response_json.get('output') or []:
        if output_item.get('type') != 'message':
            continue
        for part in output_item.get('content') or []:
            text = part.get('text')
            quto = part.get('annotations')
    return (text, quto)
def get_kimi_message(prop):
    """Ask the ``kimi-k2-250905`` model for a reply to ``prop``.

    Args:
        prop: the user prompt text.

    Returns:
        The ``content`` of the first choice's message.
    """
    endpoint = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
    conversation = [
        {"role": "system", "content": "你是人工智能助手."},
        {"role": "user", "content": prop},
    ]
    body = json.dumps({"model": "kimi-k2-250905", "messages": conversation})
    # NOTE(review): the API key is hard-coded in source; it should be loaded
    # from configuration or a secret store.
    request_headers = {
        'Authorization': 'Bearer fcc424e5-58af-494d-9683-5787413a26c9',
        'Content-Type': 'application/json',
    }
    reply = requests.request("POST", endpoint, headers=request_headers, data=body)
    first_choice = reply.json().get('choices')[0]
    return first_choice.get('message').get('content')
def get_deepseek_message(prop):
    """Ask the ``deepseek-v3-1-terminus`` model for a reply to ``prop``.

    Args:
        prop: the user prompt text.

    Returns:
        The ``content`` of the first choice's message.
    """
    system_turn = {"role": "system", "content": "你是人工智能助手."}
    user_turn = {"role": "user", "content": prop}
    request_body = json.dumps({
        "model": "deepseek-v3-1-terminus",
        "messages": [system_turn, user_turn],
    })
    # NOTE(review): the API key is hard-coded in source; it should be loaded
    # from configuration or a secret store.
    http_response = requests.request(
        "POST",
        "https://ark.cn-beijing.volces.com/api/v3/chat/completions",
        headers={
            'Authorization': 'Bearer fcc424e5-58af-494d-9683-5787413a26c9',
            'Content-Type': 'application/json',
        },
        data=request_body,
    )
    parsed = http_response.json()
    return parsed.get('choices')[0].get('message').get('content')
def get_qianwewn_message(prop):
    """Ask the ``qwen-plus`` model (search enabled) for a reply to ``prop``.

    Args:
        prop: the user prompt text.

    Returns:
        The ``content`` of the first choice's message.
    """
    api_url = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"
    request_spec = {
        "model": "qwen-plus",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prop},
        ],
        "enable_search": True,
    }
    # NOTE(review): the API key is hard-coded in source; it should be loaded
    # from configuration or a secret store.
    auth_headers = {
        'Authorization': 'Bearer sk-eed87fdb2c8e42d79353ba345db83d0a',
        'Content-Type': 'application/json',
    }
    answer = requests.request("POST", api_url, headers=auth_headers, data=json.dumps(request_spec))
    choices = answer.json().get('choices')
    message = choices[0].get('message')
    return message.get('content')
# Upper bound on the number of loop iterations in get_parse_sse_result.
MAX_POLL = 5
# Seconds slept between iterations of that loop.
POLL_INTERVAL_SEC = 30
def get_parse_sse(platform, task_id):
    """Call the SSE parse service for one task and return its JSON reply.

    The request URL embeds the platform code and the location of the task's
    ``original.text``. After the request, the URL is also reported through
    ``feishu_tobot_sse``.

    Args:
        platform: platform code used in the query string and the file path.
        task_id: task identifier used in the file path.

    Returns:
        The decoded JSON body of the service response.
    """
    request_url = f"http://115.191.66.107:8894/parse_sse?platform={platform}&sse_url=https://douchacha-web.tos-cn-beijing.volces.com/geo/{task_id}/{platform}/original.text"
    service_reply = requests.request("GET", request_url, headers={}, data={})
    feishu_tobot_sse(request_url + " 触发sse解析接口")
    return service_reply.json()
def get_parse_sse_result(platform, task_id):
    """Poll the SSE parse service until it reports a finished result.

    Fixes over the previous version:
    * the loop now re-queries the service after every sleep. Before, the first
      response was inspected ``MAX_POLL`` times, so a result that was not
      ready immediately could never be picked up and the function slept for
      nothing;
    * a missing ``data`` object or ``think`` value no longer raises;
    * the function always returns a 5-tuple, even when ``MAX_POLL`` is 0.

    NOTE(review): every poll goes through ``get_parse_sse``, which also posts a
    notification via ``feishu_tobot_sse`` — confirm that repeating the call
    (and the notification) is acceptable for this service.

    Args:
        platform: platform code forwarded to ``get_parse_sse``.
        task_id: task identifier forwarded to ``get_parse_sse``.

    Returns:
        Tuple ``(context, quote, suggestions, think, search_word)``. ``think``
        is blanked when shorter than 30 characters. When the service reports
        not-ok or never reaches code 200, ``("", [], [], "", [])`` is
        returned.
    """
    response_data = get_parse_sse(platform, task_id)
    for attempt in range(MAX_POLL):
        if not response_data.get("ok"):
            return ("", [], [], "", [])
        if response_data.get("code") == 200:
            data = response_data.get("data") or {}
            think = data.get('think') or ""
            # Very short "thinking" text is treated as noise and dropped.
            if len(think) < 30:
                think = ""
            return (
                data.get('context'),
                data.get('quote'),
                data.get('suggestions'),
                think,
                data.get('search_word'),
            )
        if attempt == MAX_POLL - 1:
            break
        time.sleep(POLL_INTERVAL_SEC)
        response_data = get_parse_sse(platform, task_id)
    return ("", [], [], "", [])
# Yuanbao
# Wenxin Yiyan
# Manual smoke test: send one sample prompt through get_qianwewn_message and print the reply.
if __name__ == '__main__':
result = get_qianwewn_message("dso是什么")
print(result)
# result = [{'id': 'rs_02176302229863300000000000000000000ffffac15dd3712c724', 'type': 'reasoning', 'summary': [{'type': 'summary_text', 'text': '我现在需要回答用户的问题“dso是什么”。首先,我得确定DSO这个缩写可能代表的含义。DSO在不同领域有不同的意思,比如在金融领域,DSO是Days Sales Outstanding的缩写,指的是应收账款周转天数;在电子工程里,可能指Digital Storage Oscilloscope(数字存储示波器);还有可能是其他领域的术语,比如组织名称、技术名词等。\n\n用户的问题比较简短,没有上下文,所以我需要考虑多个可能性。不过,为了准确回答,最好通过搜索工具来确认最常见的含义以及是否有最新的相关信息。比如,是否有新的机构、技术或者事件使用DSO这个缩写?\n\n接下来,我需要选择合适的搜索工具。根据提供的工具列表,web_search可以选择多个source_types。考虑到DSO可能涉及多个领域,应该选择search_engine和douyin_baike这两个源。search_engine可以提供全网信息,而douyin_baike可能有更专业的百科解释。\n\n然后,确定搜索的关键词。直接用“DSO是什么”作为查询词应该可以覆盖主要含义。可能需要同时搜索多个关键词,但这里一个关键词应该足够,因为问题比较直接。\n\n检查参数是否符合要求:source_types需要至少一个,这里选search_engine和douyin_baike;queries是数组,包含一个查询词。确认无误后,生成工具调用的JSON结构。'}], 'status': 'completed'}, {'type': 'web_search_call', 'action': {'query': 'dso是什么', 'type': 'search'}, 'status': 'completed', 'id': 'ws_02176302230525400000000000000000000ffffac15dd37f6f16b'}, {'id': 'rs_02176302230724100000000000000000000ffffac15dd3748c557', 'type': 'reasoning', 'summary': [{'type': 'summary_text', 'text': '\n用户问"dso是什么",我需要先看搜索结果里的信息。搜索结果里有几个不同的解释:\n\n首先看摘要1,Digital Storage Oscilloscope(数字存储示波器),这个是比较常见的电子设备,摘要1详细讲了原理和应用,还有摘要5也是讲这个的,来自Tektronix,权威。然后摘要2提到Device Software Optimization(设备软件优化),是软件工具类的;摘要3是Direct Sparse Odometry,计算机视觉里的视觉里程计方法;摘要4是Days Sales Outstanding,企业财务里的应收账款平均收款天数。\n\n所以DSO有多个含义,得把这些不同领域的解释都列出来,每个含义说明清楚对应的领域和主要内容。需要确认每个解释的准确性,比如财务那个摘要4是2025年的,比较新;视觉那个摘要3是2023年的论文解读;示波器的摘要1是2024年的,都比较可靠。不需要再搜索了,因为已经覆盖了几个主要领域的含义,信息足够回答用户的问题。'}], 'status': 'completed'}, {'type': 'message', 'role': 'assistant', 'content': [{'type': 'output_text', 'text': 'DSO是一个多义词,在不同领域有不同含义,以下是主要解释:\n\n### 1. **数字存储示波器(Digital Storage Oscilloscope)** \n电子工程领域的基础测量仪器,通过数字化方式采集、存储和分析电信号波形。 \n- **核心功能**: \n - 捕捉单次/非周期信号,支持波形存储、回放与对比; \n - 高精度测量频率、幅值、上升时间等参数; \n - 可通过接口将数据传输至计算机进行深入分析; \n- **应用场景**:电子设备研发、故障诊断、自动化测试等。 \n\n### 2. 
**应收账款平均收款天数(Days Sales Outstanding)** \n企业财务管理指标,反映应收账款回收效率。 \n- **计算公式**:`DSO = 应收账款余额 ÷ 年销售额 × 365天`; \n- **意义**:数值越低表示回款速度越快,现金流健康度越高;数值过高可能导致资金链压力; \n- **优化方式**:加强客户信用评估、制定催收政策、使用信息化管理系统等。 \n\n### 3. **直接稀疏里程计(Direct Sparse Odometry)** \n计算机视觉领域的视觉里程计算法(SLAM技术分支)。 \n- **核心特点**: \n - 采用稀疏直接法估计相机运动,无需提取特征点; \n - 速度快(据称可达传统特征点法5倍),精度较高但存在累计误差; \n - 仅实现单目视觉里程计,无回环检测功能; \n- **应用场景**:机器人导航、自动驾驶环境感知等。 \n\n### 4. **设备软件优化(Device Software Optimization)** \n软件工程领域的工具/方法集,用于提升嵌入式设备软件性能。 \n- **目标**:降低开发成本、提高软件可靠性与运行效率; \n- **应用场景**:智能硬件、工业设备、消费电子等嵌入式系统开发。 \n\n需根据具体上下文判断DSO的含义。', 'annotations': [{'type': 'url_citation', 'title': '数字存储示波器的原理和应用', 'url': 'https://m.sohu.com/a/843397558_121970079/', 'logo_url': 'https://p11-volcsearch-sign.byteimg.com/isp-i18n-media/img/6b1d3a6f49304d84d8268210e144399e~tplv-obj.jpeg?lk3s=7acb411c&scene=volc_search&x-expires=1825138266&x-signature=PTxYC7LXmtjxkZRbRv4GCMuDCdU%3D', 'site_name': '搜索引擎-手机搜狐网', 'publish_time': '2024年12月30日 14:16:00(CST) 星期一', 'summary': '数字存储示波器(Digital Storage Oscilloscope,简称DSO)是一种采用数字电路进行模/数转换,并通过存储器实现对触发前信号进行记忆的一种具备存储功能的数字化设备。以下是对其原理及应用的详细介绍:\n数字存储示波器\n一、原理 \n信号采集与数字化: \n当信号进入数字存储示波器后,示波器会按一定的时间间隔对信号电压进行采样。\n采样得到的模拟信号经过模/数变换器(ADC)进行数字化处理,生成代表每一个采样电压的二进制字。 \n信号存储与处理: \n数字化后的信号被存储在示波器的存储器(如RAM)中。\n示波器内部的微处理器可以对存储的信号进行各种处理,如测量、分析、显示等。 \n展开剩余 61 % \n信号显示: \n当需要观察存储的信号时,微处理器会从存储器中按原顺序取出数字信号。\n这些数字信号经过数/模转换器(D/A)转换回模拟信号,并经过放大后送到示波器的阴极射线管(CRT)或其他显示设备上显示出来。 \n触发机制: \n示波器通常具有触发功能,可以根据设定的触发条件(如信号电压达到某值并处于上升沿)开始采集和显示信号。\n触发机制确保了示波器能够稳定地显示信号波形。\n二、应用 \n波形观测与比较: \n数字存储示波器可以观测和比较单次过程和非周期现象、低频和慢速信号,以及不同时间不同地点观测到的信号。\n它还可以同时显示不同时间或相同时间发生的几个波形,便于对波形进行仔细分析和研究。 \n数据存储与传输: \n由于数字存储示波器以数字形式存储信号波形,因此可以长时间地保存信号数据。\n这些数据可以通过标准接口(如GPIB接口)传输到计算机或其他外部设备上进行进一步的分析和处理。 \n测量与分析: \n数字存储示波器具有高精度的测量功能,可以测量信号的频率、幅值、上升时间等参数。\n它还可以对存储的信号进行各种数字处理分析,如平均选加、信号的相关处理、频谱分析、能谱分析和FFT分析等。 \n自动化测试: \n数字存储示波器通常具有程控和遥控能力,可以通过编程实现自动化测试。\n这使得它在生产线测试、自动测试系统等领域具有广泛的应用前景。\n综上所述,数字存储示波器以其独特的原理和广泛的应用领域,在电子测量领域发挥着重要作用。 \n发布于:广东省 \n'}, {'type': 'url_citation', 'title': 
'DSO市场地位抬头', 'url': 'https://www.cnblogs.com/safeking/archive/2005/07/12/191071.html', 'logo_url': 'https://p11-volcsearch-sign.byteimg.com/isp-i18n-media/image/8b678b679949fd7841e473fe5e088a50~tplv-obj.jpeg?lk3s=7acb411c&scene=volc_search&x-expires=1825138266&x-signature=hXL2uwqkIwhGyZ0sIQVw04IneGU%3D', 'site_name': '搜索引擎-博客园', 'publish_time': '2005年07月12日 08:30:00(CST) 星期二', 'summary': 'DSO市场地位抬头作为一种新的软件门类,DSO(Device software optimization,设备软件优化)已经得到业界的逐渐认可,并且将会对全球电子制造业带来深远的影响。大到舰船、飞机和宇宙飞船,小到我们手中的数码相机和MP3播放器,任何需要软件来进行控制的电子设备都将因为这个新概念的诞生而变得更加聪明、可靠、廉价。那么,什么是DSO?它与以前的嵌入式软件(Embedded Software)有何联系与区别?它将以怎样的方式来影响电子制造业?电子制造厂商应该怎样迎接DSO时代的来临?就让我们一起来拨开这一层层迷雾。DSO——来得正是时候DSO(Device Software Optimization,设备软件优化)是一套帮助电子制造厂商,使其产品中的软件更加快速、可靠的工具和方法,同时可以让这些软件的开发成本大幅度降低,并且通过提高软件可靠性而降低产品的维护成本。与所有的产业都一样,电子设备制造厂商永远都面临着降低成本的压力。厂商们总是希望自己的产品更加可靠、更加智能化。不仅如此,最重要的是,有越来越多的设备已经把网络互联能力作为一种必须的功能,以便与其他的设备交换信息,或者在更加复杂的网络环境中进行管理、控制与协同。任何设备的信息共享与网络控制功能都必须通过更加复杂的软件来实现。'}, {'type': 'url_citation', 'title': 'DSO详解-Direct Sparse Odometry论文解读', 'url': 'https://blog.csdn.net/weixin_41803874/article/details/84197226', 'logo_url': 'https://p26-volcsearch-sign.byteimg.com/isp-i18n-media/img/a1684c00ee5f4a62c7c1047d6d51b665~tplv-obj.jpeg?lk3s=7acb411c&scene=volc_search&x-expires=1825138266&x-signature=b0jiHNap9b2kl%2B5izFQ4AOYH9FE%3D', 'site_name': '搜索引擎-CSDN博客', 'publish_time': '2023年05月24日 11:00:30(CST) 星期三', 'summary': 'DSO详解-Direct Sparse Odometry论文解读 转载\n元学习论文总结||小样本学习论文总结\n2017-2019年计算机视觉顶会文章收录 AAAI2017-2019 CVPR2017-2019 ECCV2018 ICCV2017-2019 ICLR2017-2019 NIPS2017-2019\nDSO(Direct Sparse Odometry),是慕尼黑工业大学(Technical University of Munich, TUM)计算机视觉实验室的雅各布.恩格尔(Jakob Engel)博士,于2016年发布的一个视觉里程计方法(期刊论文见[1],实验室主页见Computer Vision 
Group)。在SLAM领域,DSO属于稀疏直接法,据论文称能达到传统特征点法的五倍速度(需要降低图像分辨率),并保持同等或更高精度,代码见:JakobEngel/dso。然而,由于某些历史和个人的原因,DSO的代码清晰度和可读性,明显弱于其他SLAM方案如ORB、SVO、okvis等,使得研究人员很难以它为基础,展开后续的研究工作。因此,本文希望从理论和实现层面解读DSO,尝试为其他对DSO感兴趣的研究人员提供一些有益的思路和观点。\n注:\n为了读懂本文,我们假定读者已具有视觉SLAM的基本知识,否则,请先阅读相关材料。另外,如果读过DSO论文或代码,可能对本文有更好的理解。\n由于知乎平台分辨率限制,插图可能不够清晰。如果能图像质量有更高要求,请联系作者:gao.xiang.thu at gmail dot com.\n由于本文较长,我会花几次时间进行更新,应读者要求先发布草稿版。\n以下是本文的提纲:\n提纲\n概述\n流程框架\n滑动窗口\n光度标定\n评述\n资料与参考文献\n\xa0\n概述\nDSO属于稀疏直接法的视觉里程计。它不是完整的SLAM,因为它不包含回环检测、地图复用的功能。因此,它不可避免地会出现累计误差,尽管很小,但不能消除。DSO目前开源了单目实现,双目DSO的论文已被ICCV接收,但目前未知是否开源。\nDSO是少数使用纯直接法(Fully direct)计算视觉里程计的系统之一。相比之下,SVO[2]属于半直接法,仅在前端的Sparse model-based Image Alignment部分使用了直接法,之后的位姿估计、bundle adjustment'}, {'type': 'url_citation', 'title': 'dso是什么意思', 'url': 'https://www.580dns.com/knowledgebaseview?id=21797', 'logo_url': 'https://p26-volcsearch-sign.byteimg.com/isp-i18n-media/img/d76ba6af720a8c8489d3c62a7cf45707~tplv-obj.jpeg?lk3s=7acb411c&scene=volc_search&x-expires=1825138266&x-signature=gS1dWCQF8X3Gl8mwNCQsx7PJT48%3D', 'site_name': '搜索引擎-达州创梦网络', 'publish_time': '2025年02月19日 00:00:00(CST) 星期三', 'summary': 'dso是什么意思\nDSO是衡量企业应收账款周转速度的指标,反映公司收款效率和财务状况,计算公式为应收账款/年销售额/销售天数,对现金流和资金周转至关重要。优化DSO需制定严格管理政策、强化销售团队培训、采用信息管理系统等。\nDSO是什么意思\n一、DSO的概述\nDSO,全称Days Sales Outstanding,中文可以译为“应收账款平均收款天数”。它是一个衡量企业应收账款周转速度的重要指标,通常用来反映公司对客户的收款效率和企业的财务状况。DSO的长短与企业的财务管理水平和资金使用效率息息相关。\n二、DSO的计算方式\nDSO的计算通常通过总赊销净额与平均应收款的应收账款回收周期(平均)进行比较。一般来说,计算公式如下:\nDSO = 应收账款 / 
年销售额/销售天数\n这个公式可以得出企业平均需要多少天才能将应收账款收回。\n三、DSO的重要性\nDSO对于企业来说非常重要,因为它直接关系到企业的现金流和资金周转。一个较高的DSO值可能意味着企业的应收账款回收周期较长,这可能会对企业的资金链造成压力。相反,一个较低的DSO值则表明企业能够快速地回收账款,有更好的现金流管理和使用效率。\n四、如何优化DSO\n对于企业来说,优化DSO非常重要。首先,需要制定严格的应收账款管理政策,包括制定合理的赊销政策、加强客户信用评估等。其次,要强化对销售团队的培训和管理,提高他们的账款催收能力和责任心。同时,需要采用有效的技术和手段,比如现代化的信息管理系统,来更好地追踪和监控应收账款的状态和进度。\n此外,要确保企业在商业竞争中能够持续保持竞争力,必须加强与客户的沟通和合作,建立长期稳定的合作关系。这样不仅可以提高客户的满意度和忠诚度,还可以通过与客户共同协商和合作来优化DSO。\n总之,DSO是衡量企业财务管理水平的重要指标之一。通过了解其含义、计算方式以及重要性,企业可以更好地管理其应收账款,优化资金使用效率,提高企业的竞争力和盈利能力。\n标签:\nDSO\n应收账款\n平均收款天数\n计算方式\n财务管理\n'}, {'type': 'url_citation', 'title': 'Digital Storage Oscilloscopes', 'url': 'https://www.tek.com/en/oscilloscope/digital-storage-oscilloscope-dso', 'logo_url': 'https://p26-volcsearch-sign.byteimg.com/isp-i18n-media/image/3f6bf1aefb48a96480eaae850e4f3896~tplv-obj.jpeg?lk3s=7acb411c&scene=volc_search&x-expires=1825138266&x-signature=qEYiJrrbs%2FP00iExeJmYgLIElew%3D', 'site_name': '搜索引擎-Tektronix', 'publish_time': '1970年01月01日 08:00:00(CST) 星期四', 'summary': 'Digital Storage Oscilloscopes\nDigital Storage Oscilloscopes (DSO)\nA digital storage oscilloscope (DSO) is a fundamental tool used in electrical engineering and electronics design, offering a digital approach to signal analysis that surpasses the capabilities of traditional analog oscilloscopes. With a DSO, professionals and enthusiasts alike can capture, store, and analyze complex electronic signals, facilitating a deeper understanding of electrical phenomena.\nWhat is a DSO?\nA DSO is an instrument that captures and stores digital representations of electrical signals. Unlike its analog predecessors that relied on phosphor to display signals, DSOs use a digital screen to render waveform data, enabling a more versatile and detailed examination of electrical signals. 
This also allows more specialized analysis tools to be utilized on the instrument.\nKey Features of DSOs:\nPermanent Signal Storage : Waveforms captured by a DSO are stored in digital form, allowing for easy retrieval, analysis'}]}], 'status': 'completed', 'id': 'msg_02176302231280100000000000000000000ffffac15dd375253d2'}]
# for i in result_list:
# if i.get('type') == 'message':
# for j in i.get('content'):
# print(j.get('text'))
# print(j.get('annotations'))
# -*- coding: utf-8 -*-
import time
import requests
import json
from aidso_geo.core.down_load_bot import get_req_id
from aidso_geo.utils import bh_utils
from aidso_geo.utils.tos_utils import get_string_from_tos
def ai_get_brand_list(content, prompt):
    """Extract the business names mentioned in an AI answer.

    Posts the question/answer pair to the Ark chat-completions endpoint with a
    strict JSON schema (``{"brands": [str, ...]}``) and returns that list.

    Args:
        content: The AI answer text to analyse.
        prompt: The question that produced ``content``.

    Returns:
        list: The value of the ``brands`` field of the model reply, or ``[]``
        when the request fails, times out, or the reply cannot be parsed.
    """
    url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
    payload = json.dumps({
        "model": "doubao-seed-1-6-250615",
        "messages": [
            {
                "role": "system",
                "content": """
用户提供问题和结果提取信息,严格遵循以下核心逻辑与要求:
核心逻辑:优先识别文本核心表述对象 —— 若原文以品牌推荐为主,重点提取品牌名;若以厂名介绍为主,重点提取厂名;若以店名介绍为主,重点提取店名;若包含店名、公司名,均按文本核心导向同步提取(如门店推荐类文本优先提取店名,平台盘点类优先提取平台名)。
提取范围:仅提取商业主体名称(含品牌名、公司名、店名、厂名、品牌授权门店名、连锁门店名),彻底忽略地址、服务、优惠、功能描述等非名称类内容。
排序规则:严格遵循文本原文推荐的先后顺序(含分类、分点内的自然出现顺序,不打乱原文逻辑)。
去重规则:完全相同的名称仅保留 1 个;同一品牌 / 公司下的不同门店、分支机构(名称不同)视为不同主体,全部保留;名称中特殊字符(括号、空格等)不影响去重判断。
名称保留:完整保留名称原始表述,不做简化、缩写或修改(含括号、空格、特殊符号等)
输出格式:仅输出纯 JSON 数组,数组键名固定为不含任何解释、注释、标题等额外文字
"""
            },
            {
                "role": "user",
                "content": f"问题:{prompt}回答:{content}"
            }
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "brand_list",
                "strict": True,
                "schema": {
                    "type": "object",
                    "properties": {
                        "brands": {
                            "type": "array",
                            "items": {
                                "type": "string"
                            },
                            "description": "品牌列表,按原文出现顺序排列,每个元素是品牌名称字符串"
                        }
                    },
                    "required": [
                        "brands"
                    ],
                    "additionalProperties": False
                }
            }
        },
        "thinking": {
            "type": "disabled"
        },
        "temperature": 0
    })
    # NOTE(review): the API credential is hard-coded here; consider moving it
    # to configuration / secret storage.
    headers = {
        'Authorization': 'Bearer fcc424e5-58af-494d-9683-5787413a26c9',
        'Content-Type': 'application/json'
    }
    try:
        # The request lives inside the try block so that connection errors and
        # timeouts degrade to an empty result instead of crashing the caller.
        response = requests.request("POST", url, headers=headers, data=payload, timeout=60)
        response.raise_for_status()
        reply = response.json()["choices"][0]["message"]["content"]
        # ``or []`` keeps the return type a list even if the key is absent.
        return json.loads(reply).get("brands") or []
    except Exception:
        return []
def ai_get_product_list(content, prompt):
    """Extract product words from a piece of text, in first-appearance order.

    No ``response_format`` is sent for this model; the expected JSON shape
    (``{"product_words": [...]}``) is described in the system prompt instead.

    Args:
        content: The text to extract product words from.
        prompt: Accepted for signature compatibility; it is not sent to the
            model by this function.

    Returns:
        list: The value of the ``product_words`` field of the model reply, or
        ``[]`` when the request fails, times out, or the reply cannot be parsed.
    """
    url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
    payload = json.dumps({
        "model": "doubao-seed-2-0-lite-260215",
        "messages": [
            {
                "role": "system",
                "content": """
你是一个“产品词抽取器”。
任务:
从我提供的文本中,提取所有“产品词”,并按“首次出现顺序”输出。
【什么是产品词】
产品词包括但不限于:
1. 具体单品名
2. 产品系列名
3. 带版本/代际/款型/规格的产品名
4. 品牌+产品组合名
5. 用户在消费语境中可以直接识别、购买、比较的产品名称
【不要提取的内容】
以下内容即使很重要,也不是产品词,不要输出:
1. 纯品牌名
2. 成分词 / 原料词 / 技术词
3. 功效词 / 需求词
4. 人群词 / 肤质词 / 场景词
5. 纯品类通用词(除非它和具体产品名绑定)
6. 价格、规格、时间、评价性描述
【抽取原则】
1. 保留原文写法,不要擅自改写。
2. 若同一产品重复出现,只保留一次。
3. 若“品牌+产品名”和“产品名简称”都出现:
- 如果简称明确指向同一产品,可只保留信息更完整的那个;
4. 优先提取“可以被当作商品识别对象”的词,而不是泛概念。
5. 以 / , 等符号分割的 提取为多个
6. 若不确定是否为产品词,宁可不提。
输出格式:
{
"product_words": [
]
}
"""
            },
            {
                "role": "user",
                "content": f"""请从下面的文本中抽取所有产品词。 文本如下{content}"""
            }
        ],
        "thinking": {
            "type": "disabled"
        },
        "temperature": 0
    })
    # NOTE(review): the API credential is hard-coded here; consider moving it
    # to configuration / secret storage.
    headers = {
        'Authorization': 'Bearer fcc424e5-58af-494d-9683-5787413a26c9',
        'Content-Type': 'application/json'
    }
    try:
        # The request lives inside the try block so that connection errors and
        # timeouts degrade to an empty result instead of crashing the caller.
        response = requests.request("POST", url, headers=headers, data=payload, timeout=60)
        response.raise_for_status()
        reply = response.json()["choices"][0]["message"]["content"]
        # ``or []`` keeps the return type a list even if the key is absent.
        return json.loads(reply).get("product_words") or []
    except Exception:
        return []
def ai_get_product_list_search(product_list):
    """Filter a list of names down to the ones that denote a concrete product.

    The model is asked to drop entries that are only a brand, company, or
    series name and to keep "brand + product" names unchanged and in order.

    Args:
        product_list: Candidate names; interpolated into the user message.

    Returns:
        list: The value of the ``brands`` field of the model reply (the schema
        reuses that key for the kept product names). ``[]`` when the input is
        empty or when the request fails, times out, or cannot be parsed.
    """
    # Nothing to filter: skip the model call entirely.
    if not product_list:
        return []

    url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
    payload = json.dumps({
        "model": "doubao-seed-1-6-250615",
        "messages": [
            {
                "role": "system",
                "content": """
你是一名信息抽取助手。
你的任务是:从给定名称列表中,删除“只有品牌名、没有具体产品名”的项,只保留“具体产品名称”。
【判定规则】
1. 仅保留能明确指向某个具体商品/产品的名称。
2. 纯品牌名、公司名、系列名,不保留。
3. 如果名称中同时包含“品牌 + 产品名”,要整体保留,因为它属于具体产品名称。
4. 不要因为前面带品牌名就删除,只要后面有明确产品名,就保留完整原文。
5. 输出时保持原顺序,不改写,不拆分,不补充,不去重。
6. 最终只返回保留后的列表,不要输出解释。
"""
            },
            {
                "role": "user",
                "content": f"名称列表:{product_list}"
            }
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "brand_list",
                "strict": True,
                "schema": {
                    "type": "object",
                    "properties": {
                        "brands": {
                            "type": "array",
                            "items": {
                                "type": "string"
                            },
                            "description": "品牌产品词列表"
                        }
                    },
                    "required": [
                        "brands"
                    ],
                    "additionalProperties": False
                }
            }
        },
        "thinking": {
            "type": "disabled"
        },
        "temperature": 0
    })
    # NOTE(review): the API credential is hard-coded here; consider moving it
    # to configuration / secret storage.
    headers = {
        'Authorization': 'Bearer fcc424e5-58af-494d-9683-5787413a26c9',
        'Content-Type': 'application/json'
    }
    try:
        # The request lives inside the try block so that connection errors and
        # timeouts degrade to an empty result instead of crashing the caller.
        response = requests.request("POST", url, headers=headers, data=payload, timeout=60)
        response.raise_for_status()
        reply = response.json()["choices"][0]["message"]["content"]
        # ``or []`` keeps the return type a list even if the key is absent.
        return json.loads(reply).get("brands") or []
    except Exception:
        return []
def ai_get_brand_name(content):
    """Map each input keyword to the brand it belongs to.

    ``str(content)`` is sent as the user message; the strict JSON schema asks
    for ``{"results": [{"keyword": str, "brand": str}, ...]}``.

    Args:
        content: The keyword(s) to recognise; converted with ``str()``.

    Returns:
        list: The ``results`` entries of the model reply. ``[]`` when the input
        is empty or when the request fails, times out, or cannot be parsed.
    """
    # Nothing to recognise: skip the model call entirely.
    if not content:
        return []

    url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
    payload = json.dumps({
        "model": "doubao-seed-1-6-250615",
        "messages": [
            {
                "role": "system",
                "content": """
###
假如你是品牌识别专家,你将根据用户输入的词汇列表,来解决识别该词汇对应品牌的任务。根据以下规则一步步执行:
1. 若输入词汇能明确对应某个已知品牌,**优先返回该品牌的标准中文名称**,无标准中文名称时再返回品牌英文名称;
2. 若输入词汇无法对应任何已知品牌,则返回“没有品牌”;
3. 无论用户输入的是中文词汇还是英文品牌名,均按上述规则匹配返回(英文品牌名先匹配对应中文,无中文则返回原英文)。
4。用户可能输入的是某个产品线产品词 也返回对应的品牌名称
参考例子:
示例1:
输入:苹果
输出:苹果
示例2:
输入:桌子
输出:没有品牌
示例3:
输入:Nike
输出:耐克
示例4:
输入:Google
输出:谷歌
示例5:
输入:Tesla
输出:特斯拉
示例6:
输入:Zara
输出:飒拉
示例7:
输入:Noname
输出:没有品牌
请回答问题:
输入:XXX
输出:
要求:
1 直接返回识别结果,无需额外解释;
2 严格遵循“先中文、无中文再英文”的返回原则,每一个识别结果均需执行此规则;
3 若输入词汇包含“品牌名 + 业务/产品线/场景词”(如:打车/外卖/出行/优选/买菜/闪购/到家/酒店/团购/买药/骑行/配送/同城/快送/生活/钱包/支付/商城/直播/短视频/云/地图等),则忽略业务词,输出其母品牌标准中文名
4 若识别到品牌按规则返回对应名称,否则返回“没有品牌”。
###
"""
            },
            {
                "role": "user",
                "content": str(content)
            }
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "brand_recognize_result",
                "strict": True,  # strict mode: no fields outside the schema
                "schema": {
                    "type": "object",
                    "properties": {
                        "results": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "keyword": {"type": "string"},
                                    "brand": {"type": "string"},
                                },
                                "required": ["keyword", "brand"],
                                "additionalProperties": False,
                            }
                        }
                    },
                    "required": ["results"],
                    "additionalProperties": False  # no extra top-level fields
                }
            }
        },
        "thinking": {
            "type": "disabled"
        },
        "temperature": 0
    })
    # NOTE(review): the API credential is hard-coded here; consider moving it
    # to configuration / secret storage.
    headers = {
        'Authorization': 'Bearer fcc424e5-58af-494d-9683-5787413a26c9',
        'Content-Type': 'application/json'
    }
    try:
        # Request and parsing share one guard: a transport failure, an empty
        # ``choices`` array, or a non-object reply all fall back to [].
        response = requests.request("POST", url, headers=headers, data=payload, timeout=60)
        reply = response.json()["choices"][0]["message"]["content"]
        results = json.loads(reply).get("results")
    except Exception:
        return []
    # ``or []`` keeps the return type a list even if the key is absent.
    return results or []
def ai_get_product_sentiment_and_mentions(brand_list, ai_answer_text):
"""
入参:
- brand_list: list[str] 品牌列表(顺序很重要)
- ai_answer_text: str AI回答文本
返回:
list[dict],格式:
[
{
"brand_name": "...",
"sentiment": "正向|负向|中性",
"positive_mentions": [...],
"negative_mentions": [...]
},
...
]
"""
url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
system_prompt = """
你是一位专业的产品舆情与GEO内容分析专家。
你需要根据「AI回答内容」对「指定产品列表」逐个输出每个品牌的:
1) sentiment:情感倾向,仅允许 ["正向","负向","中性"]
- 若无法判断,必须输出 "中性"
- sentiment 必须与该品牌在文本中的语境一致,禁止凭空编造
2) positive_mentions:该品牌在文本中的正面提及词/短语 例如:优势、好评、推荐、认可、亮点、强项、信任等
3) negative_mentions:该品牌在文本中的负面提及词/短语 例如:缺点、差评、风险、争议、吐槽、弱点等
强制规则:
- 必须覆盖产品列表中的每个产品,输出 items 数组长度必须等于产品列表长度,并尽量保持顺序一致
- 只提取文本中真实出现或可直接归纳的词/短语,禁止编造
- 必须与该产品直接相关(若无法指向产品,则不要提取)
- 每条词/短语严格 2~10 个字
- 同义词合并去重(如“舒服/舒适”只保留更标准的)
- 若某品牌在文本中没有可提取内容,则 positive_mentions / negative_mentions 返回空数组 []
- 输出必须严格符合 JSON Schema
- 仅输出纯 JSON,不要输出任何额外文字
""".strip()
payload = {
"model": "doubao-seed-1-6-250615",
"messages": [
{"role": "system", "content": system_prompt},
{
"role": "user",
"content": f"产品列表:{json.dumps(brand_list, ensure_ascii=False)}\n\nAI回答内容:{ai_answer_text}"
}
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "brand_sentiment_and_mentions",
"strict": True,
"schema": {
"type": "object",
"properties": {
"items": {
"type": "array",
"items": {
"type": "object",
"properties": {
"brand_name": {"type": "string"},
"sentiment": {
"type": "string",
"enum": ["正向", "负向", "中性"]
},
"positive_mentions": {
"type": "array",
"items": {"type": "string"}
},
"negative_mentions": {
"type": "array",
"items": {"type": "string"}
}
},
"required": ["brand_name", "sentiment", "positive_mentions", "negative_mentions"],
"additionalProperties": False
}
}
},
"required": ["items"],
"additionalProperties": False
}
}
},
"thinking": {"type": "disabled"},
"temperature": 0,
"top_p": 1
}
headers = {
"Authorization": "Bearer fcc424e5-58af-494d-9683-5787413a26c9",
"Content-Type": "application/json"
}
try:
resp = requests.post(url, headers=headers, data=json.dumps(payload), timeout=60)
resp.raise_for_status()
data = resp.json()
content = data["choices"][0]["message"]["content"]
parsed = json.loads(content)
items = parsed.get("items", [])
if not isinstance(items, list):
return _fallback_items(brand_list)
# ✅ 兜底:保证每个品牌都有一条(并且按 brand_list 顺序)
mapped = {i.get("brand_name"): i for i in items if isinstance(i, dict)}
out = []
for b in brand_list:
it = mapped.get(b) or {"brand_name": b, "sentiment": "中性", "positive_mentions": [],
"negative_mentions": []}
# 进一步强兜底字段
out.append({
"brand_name": it.get("brand_name", b),
"sentiment": it.get("sentiment", "中性") if it.get("sentiment") in ("正向", "负向", "中性") else "中性",
"positive_mentions": it.get("positive_mentions") if isinstance(it.get("positive_mentions"),
list) else [],
"negative_mentions": it.get("negative_mentions") if isinstance(it.get("negative_mentions"),
list) else [],
})
return out
except Exception:
return _fallback_items(brand_list)
def ai_get_brand_sentiment_and_mentions(brand_list, ai_answer_text):
"""
入参:
- brand_list: list[str] 品牌列表(顺序很重要)
- ai_answer_text: str AI回答文本
返回:
list[dict],格式:
[
{
"brand_name": "...",
"sentiment": "正向|负向|中性",
"positive_mentions": [...],
"negative_mentions": [...]
},
...
]
"""
url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
system_prompt = """
你是一位专业的品牌舆情与GEO内容分析专家。
你需要根据「AI回答内容」对「指定品牌列表」逐个输出每个品牌的:
1) sentiment:情感倾向,仅允许 ["正向","负向","中性"]
- 若无法判断,必须输出 "中性"
- sentiment 必须与该品牌在文本中的语境一致,禁止凭空编造
2) positive_mentions:该品牌在文本中的正面提及词/短语 例如:优势、好评、推荐、认可、亮点、强项、信任等
3) negative_mentions:该品牌在文本中的负面提及词/短语 例如:缺点、差评、风险、争议、吐槽、弱点等
强制规则:
- 必须覆盖品牌列表中的每个品牌,输出 items 数组长度必须等于品牌列表长度,并尽量保持顺序一致
- 只提取文本中真实出现或可直接归纳的词/短语,禁止编造
- 必须与该品牌直接相关(若无法指向品牌,则不要提取)
- 每条词/短语严格 2~10 个字
- 同义词合并去重(如“舒服/舒适”只保留更标准的)
- 若某品牌在文本中没有可提取内容,则 positive_mentions / negative_mentions 返回空数组 []
- 输出必须严格符合 JSON Schema
- 仅输出纯 JSON,不要输出任何额外文字
""".strip()
payload = {
"model": "doubao-seed-1-6-250615",
"messages": [
{"role": "system", "content": system_prompt},
{
"role": "user",
"content": f"品牌列表:{json.dumps(brand_list, ensure_ascii=False)}\n\nAI回答内容:{ai_answer_text}"
}
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "brand_sentiment_and_mentions",
"strict": True,
"schema": {
"type": "object",
"properties": {
"items": {
"type": "array",
"items": {
"type": "object",
"properties": {
"brand_name": {"type": "string"},
"sentiment": {
"type": "string",
"enum": ["正向", "负向", "中性"]
},
"positive_mentions": {
"type": "array",
"items": {"type": "string"}
},
"negative_mentions": {
"type": "array",
"items": {"type": "string"}
}
},
"required": ["brand_name", "sentiment", "positive_mentions", "negative_mentions"],
"additionalProperties": False
}
}
},
"required": ["items"],
"additionalProperties": False
}
}
},
"thinking": {"type": "disabled"},
"temperature": 0,
"top_p": 1
}
headers = {
"Authorization": "Bearer fcc424e5-58af-494d-9683-5787413a26c9",
"Content-Type": "application/json"
}
try:
resp = requests.post(url, headers=headers, data=json.dumps(payload), timeout=60)
resp.raise_for_status()
data = resp.json()
content = data["choices"][0]["message"]["content"]
parsed = json.loads(content)
items = parsed.get("items", [])
if not isinstance(items, list):
return _fallback_items(brand_list)
# ✅ 兜底:保证每个品牌都有一条(并且按 brand_list 顺序)
mapped = {i.get("brand_name"): i for i in items if isinstance(i, dict)}
out = []
for b in brand_list:
it = mapped.get(b) or {"brand_name": b, "sentiment": "中性", "positive_mentions": [],
"negative_mentions": []}
# 进一步强兜底字段
out.append({
"brand_name": it.get("brand_name", b),
"sentiment": it.get("sentiment", "中性") if it.get("sentiment") in ("正向", "负向", "中性") else "中性",
"positive_mentions": it.get("positive_mentions") if isinstance(it.get("positive_mentions"),
list) else [],
"negative_mentions": it.get("negative_mentions") if isinstance(it.get("negative_mentions"),
list) else [],
})
return out
except Exception:
return _fallback_items(brand_list)
def _fallback_items(brand_list):
return [{"brand_name": b, "sentiment": "中性", "positive_mentions": [], "negative_mentions": []} for b in
brand_list]
def ai_get_product_and_brands(content, prompt):
    """Extract question-relevant products from an AI answer, grouped by brand.

    Args:
        content: The AI answer text.
        prompt: The monitored question the answer responds to.

    Returns:
        list[dict]: The ``items`` array of the model reply, each entry shaped
        as ``{"brand": str, "products": [str, ...]}``. ``[]`` when the request
        fails, times out, or the reply cannot be parsed.
    """
    url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
    payload = json.dumps({
        "model": "doubao-seed-1-6-250615",
        "messages": [
            {
                "role": "system",
                "content": """
# 角色
你是一名GEO语义解析专家。
你的任务是:根据输入的问题和对话内容,提取对话中与该问题直接相关的产品词,并为每个产品词匹配对应的品牌词。
# 任务目标
从对话中识别所有被提及的具体产品,但只保留与当前问题语义直接相关的产品。
如果某些产品虽然出现在对话中,但只是顺带提及、与问题讨论主题无关,则不要提取。
# 输入内容
用户会提供:
1){{question}}(监测问题)
2){{answer_text}}(AI回答正文)
# 提取规则
1. 产品词定义
产品词是指对话中出现的、能够指向具体商品或具体产品名称的表达,例如:
品牌 + 产品名:如“SK2神仙水”“兰蔻小黑瓶”
产品别名 / 常用名:如“神仙水”
明确可识别品牌归属的具体产品表达
2. 品牌词定义
品牌词是该产品所属的品牌名称。
输出时需尽量统一为标准品牌名。
3. 相关性判断规则(定义)
只提取与question直接相关的产品词。
应提取
当该产品属于以下情况之一时,可以提取:
是问题中明确询问、比较、推荐、评价、选择的对象
与问题讨论的品类、功效、使用场景、购买决策直接相关
在回答中作为该问题的候选产品、对比产品、推荐产品出现
不应提取
当该产品属于以下情况时,不要提取:
虽然出现在对话里,但和问题主题无关
只是附带提到的物流、平台、服务、赠品、渠道相关内容
与问题讨论的品类、用途、决策场景无直接关系
4. 过滤规则
以下内容一律不提取:
- 无法确认品牌归属的泛产品词:如“面霜”“精华”
- 非产品内容
- 与问题主题无关的产品
5. 去重规则
- 同一产品重复出现时只保留一次
- 优先保留信息更完整的表达
# 输出规则
1. 输出要求
- 先识别所有“产品-品牌”配对,再按品牌分组聚合输出
- 按品牌聚合输出,同一品牌下的多个产品放入同一个 products 数组。
- products 按对话中出现顺序输出。
- 如果某品牌只有一个产品,也仍然使用数组格式。
- 如果没有符合条件的结果,输出空数组 []。
"""
            },
            {
                "role": "user",
                "content": f"监测问题:{prompt}AI回答正文:{content}"
            }
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "brand_product_list",
                "strict": True,
                "schema": {
                    "type": "object",
                    "properties": {
                        "items": {
                            "type": "array",
                            "description": "按品牌聚合后的产品列表",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "brand": {
                                        "type": "string",
                                        "description": "标准品牌名"
                                    },
                                    "products": {
                                        "type": "array",
                                        "description": "该品牌下与问题直接相关的产品列表,按原文出现顺序输出",
                                        "items": {
                                            "type": "string"
                                        }
                                    }
                                },
                                "required": ["brand", "products"],
                                "additionalProperties": False
                            }
                        }
                    },
                    "required": ["items"],
                    "additionalProperties": False
                }
            }
        },
        "thinking": {
            "type": "disabled"
        },
        "temperature": 0
    })
    # NOTE(review): the API credential is hard-coded here; consider moving it
    # to configuration / secret storage.
    headers = {
        'Authorization': 'Bearer fcc424e5-58af-494d-9683-5787413a26c9',
        'Content-Type': 'application/json'
    }
    try:
        # The request lives inside the try block so that connection errors and
        # timeouts degrade to an empty result instead of crashing the caller.
        response = requests.request("POST", url, headers=headers, data=payload, timeout=60)
        response.raise_for_status()
        content_str = response.json()["choices"][0]["message"]["content"]
        return json.loads(content_str).get("items", [])
    except Exception:
        return []
def ai_get_brand_sentiment_and_mentions_by_articles(brand_name, article_list, batch_size=50):
    """Rate one brand against many articles, calling the model in batches.

    Args:
        brand_name: The single brand to analyse, e.g. "珂润".
        article_list: list[dict] such as
            ``[{"sourceId": "1001", "snippet": "..."}, ...]``. Entries that are
            not dicts or have no ``sourceId`` are ignored.
        batch_size: Number of articles sent per model call (default 50).

    Returns:
        list[dict]: One record per accepted article, in input order:
        ``{"sourceId": str, "brand_name": str, "sentiment": "正向"|"负向"|"中性",
        "positive_mentions": list, "negative_mentions": list}``.
        Articles the model did not report on get a neutral record. ``[]`` when
        no usable article is supplied.
    """
    # Keep only dict entries that carry a sourceId; ids are handled as strings.
    articles = [
        {"sourceId": str(entry.get("sourceId")), "snippet": entry.get("snippet", "") or ""}
        for entry in (article_list or [])
        if isinstance(entry, dict) and entry.get("sourceId") is not None
    ]
    if not articles:
        return []

    # sourceId -> validated record; a later batch overwrites an earlier one.
    analysed = {}
    for chunk in _chunk_list(articles, batch_size):
        for row in _call_single_batch(brand_name, chunk):
            if not isinstance(row, dict) or row.get("sourceId") is None:
                continue
            key = str(row.get("sourceId"))
            label = row.get("sentiment")
            positives = row.get("positive_mentions")
            negatives = row.get("negative_mentions")
            analysed[key] = {
                "sourceId": key,
                "brand_name": brand_name,
                "sentiment": label if label in ("正向", "负向", "中性") else "中性",
                "positive_mentions": positives if isinstance(positives, list) else [],
                "negative_mentions": negatives if isinstance(negatives, list) else [],
            }

    # Emit in input order, padding unreported articles with a neutral record.
    ordered = []
    for article in articles:
        key = article["sourceId"]
        if key in analysed:
            ordered.append(analysed[key])
        else:
            ordered.append({
                "sourceId": key,
                "brand_name": brand_name,
                "sentiment": "中性",
                "positive_mentions": [],
                "negative_mentions": []
            })
    return ordered
def _call_single_batch(brand_name, batch_articles):
"""
单批调用模型,返回该批文章的分析结果
"""
url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
system_prompt = """
你是一位专业的品牌舆情与GEO内容分析专家。
你会收到:
1)一个品牌名
2)多篇文章,每篇文章都带 sourceId 和 snippet
你的任务是:
分析“这个品牌”在“每一篇文章”中的表现,并输出:
1) sourceId:文章ID,必须与输入一致
2) sentiment:情感倾向,仅允许 ["正向","负向","中性"]
- 若无法判断,必须输出 "中性"
- sentiment 必须与该品牌在该文章中的语境一致,禁止凭空编造
3) positive_mentions:该品牌在该文章中的正面提及词/短语
4) negative_mentions:该品牌在该文章中的负面提及词/短语
强制规则:
- 只分析指定品牌,不要分析其他品牌
- 只提取文章中真实出现或可直接归纳的词/短语,禁止编造
- 必须与该品牌直接相关(若无法指向该品牌,则不要提取)
- 每条词/短语严格 2~10 个字
- 同义词合并去重
- 若该品牌在某文章中没有可提取内容,则 positive_mentions / negative_mentions 返回空数组 []
- 返回你能明确判断的文章结果即可
- 输出必须严格符合 JSON Schema
- 仅输出纯 JSON,不要输出任何额外文字
""".strip()
input_articles = [
{
"sourceId": str(item["sourceId"]),
"snippet": item.get("snippet", "")
}
for item in batch_articles
]
payload = {
"model": "doubao-seed-1-6-250615",
"messages": [
{"role": "system", "content": system_prompt},
{
"role": "user",
"content": (
f"品牌名:{brand_name}\n\n"
f"文章列表:{json.dumps(input_articles, ensure_ascii=False)}"
)
}
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "single_brand_article_sentiment_and_mentions",
"strict": True,
"schema": {
"type": "object",
"properties": {
"items": {
"type": "array",
"items": {
"type": "object",
"properties": {
"sourceId": {"type": "string"},
"sentiment": {
"type": "string",
"enum": ["正向", "负向", "中性"]
},
"positive_mentions": {
"type": "array",
"items": {"type": "string"}
},
"negative_mentions": {
"type": "array",
"items": {"type": "string"}
}
},
"required": [
"sourceId",
"sentiment",
"positive_mentions",
"negative_mentions"
],
"additionalProperties": False
}
}
},
"required": ["items"],
"additionalProperties": False
}
}
},
"thinking": {"type": "disabled"},
"temperature": 0,
"top_p": 1
}
headers = {
"Authorization": "Bearer fcc424e5-58af-494d-9683-5787413a26c9",
"Content-Type": "application/json"
}
batch_source_ids = [str(i["sourceId"]) for i in batch_articles]
try:
resp = requests.post(url, headers=headers, data=json.dumps(payload), timeout=120)
resp.raise_for_status()
data = resp.json()
content = data["choices"][0]["message"]["content"]
parsed = json.loads(content)
items = parsed.get("items", [])
if not isinstance(items, list):
return _fallback_single_brand_items(brand_name, batch_source_ids)
result = []
seen_source_ids = set()
for it in items:
if not isinstance(it, dict):
continue
source_id = it.get("sourceId")
if source_id is None:
continue
source_id = str(source_id)
if source_id not in batch_source_ids:
continue
if source_id in seen_source_ids:
continue
seen_source_ids.add(source_id)
result.append({
"sourceId": source_id,
"brand_name": brand_name,
"sentiment": it.get("sentiment", "中性") if it.get("sentiment") in ("正向", "负向", "中性") else "中性",
"positive_mentions": it.get("positive_mentions") if isinstance(it.get("positive_mentions"), list) else [],
"negative_mentions": it.get("negative_mentions") if isinstance(it.get("negative_mentions"), list) else [],
})
return result
except Exception:
return _fallback_single_brand_items(brand_name, batch_source_ids)
def _fallback_single_brand_items(brand_name, source_id_list):
return [
{
"sourceId": str(source_id),
"brand_name": brand_name,
"sentiment": "中性",
"positive_mentions": [],
"negative_mentions": []
}
for source_id in source_id_list
]
def _chunk_list(data, batch_size):
if batch_size <= 0:
batch_size = 20
for i in range(0, len(data), batch_size):
yield data[i:i + batch_size]
# Manual smoke test: runs only when this file is executed as a script.
if __name__ == '__main__':
    # An earlier, commented-out scaffold looked up req ids with get_req_id,
    # queried geo_commit_task for them, and loaded each answer from TOS at
    # geo/{taskId}/{platform}/context.txt. An inline sample answer is used
    # here instead.
    content = """
* **要本地味** → 可以试试**金永丰**,但要注意渠道和日期。
* **要最方便** → 去**大润发、家乐福**等大型超市,选择多,品质稳。
"""
    # Each brand needs its own trailing comma: adjacent string literals are
    # silently concatenated, which previously merged the first two names.
    brand_list = [
        '六必居',
        '大润发',
        '家乐福',
        '崔字牌',
        '海南金永丰',
        '金永丰',
    ]
    print(ai_get_brand_sentiment_and_mentions(brand_list, content))
import time
import uuid
from pymysql.cursors import DictCursor
import json
import pymysql
import os,sys
from dbutils.pooled_db import PooledDB
# Connection settings passed through to pymysql by the pool below.
# NOTE(review): the host and password are hard-coded in source; consider
# loading them from configuration or the environment.
DB_CONFIG = {
    'host': "tenant-2101894307-cn-beijing-public.bytehouse.volces.com",
    'port': 3306,
    'user': "bytehouse",
    'password': "JkoyoRV3PH:7YaWLXRwAO",
    'database': "douchacha_data",
    'charset': 'utf8mb4',
}
# Shared connection pool, created once when this module is imported.
# query_data / insert_data borrow a connection from it for each call.
POOL = PooledDB(
    creator=pymysql,
    maxconnections=50,
    mincached=5,
    maxcached=20,
    blocking=True,
    # Per the DBUtils documentation, 4 means "check the connection when a
    # query is executed" — TODO confirm against the installed version.
    ping=4,
    cursorclass=DictCursor,  # rows come back as dicts
    autocommit=True,
    connect_timeout=5,
    read_timeout=30,
    write_timeout=30,
    **DB_CONFIG
)
def query_data(sql, params=None, size=None):
    """Run ``sql`` on a pooled connection and return the fetched rows.

    Args:
        sql: The statement to execute.
        params: Optional parameters forwarded to ``cursor.execute``.
        size: When given, fetch at most this many rows with ``fetchmany``;
            otherwise fetch everything.

    Returns:
        The fetched rows, or ``None`` if anything fails (the error is printed).
    """
    connection = None
    try:
        connection = POOL.connection()
        with connection.cursor() as cur:
            cur.execute(sql, params)
            rows = cur.fetchall() if size is None else cur.fetchmany(size)
        return rows
    except Exception as exc:
        print(f"查询数据失败: {exc}")
        return None
    finally:
        # Always hand the connection back to the pool.
        if connection is not None:
            connection.close()
def insert_data(table_name, items) -> bool:
conn = None
try:
# 检查输入是否为空
if not items:
return False
conn = POOL.connection()
with conn.cursor() as cursor:
cols = items[0].keys()
cols_str = ", ".join(cols)
placeholders = ", ".join(["%s"] * len(cols))
processed_data = []
for item in items:
row = []
for key in cols:
value = item[key]
if isinstance(value, list):
row.append(json.dumps(value, ensure_ascii=False))
else:
row.append(value)
processed_data.append(tuple(row))
insert_sql = f"INSERT INTO {table_name} ({cols_str}) VALUES ({placeholders})"
cursor.executemany(insert_sql, processed_data)
return True
except Exception as e:
print(f"插入数据失败: {str(e)}")
return False
finally:
if conn is not None:
conn.close()
# Example / manual load test: runs only when this module is executed directly.
if __name__ == "__main__":
    from concurrent.futures import ThreadPoolExecutor, as_completed

    def run_query():
        """Insert one synthetic geo_commit_task row and return insert_data's result."""
        random_uuid = str(uuid.uuid4())
        data = [
            {
                "brandWords": '[\"爱搜\", \"AIDSO\"]',
                "comWords": "[]",
                "platform": 'DOUBA',
                "prompt": '视频内容优化服务适用于哪些具体场景',
                "reqId": random_uuid,
                "searchEnabled": ["程序员", "Go"],
                "taskId": random_uuid,
                "thinkingEnabled": 3,
                "search_enabled": 3,
                "insertime": 1774323442,
                "status": 'SUCCESS',
            }]
        return insert_data("geo_commit_task", data)

    # NOTE(review): this loop never terminates and keeps writing rows to
    # geo_commit_task until the process is interrupted.
    # The pool is created once and reused instead of being rebuilt per round.
    with ThreadPoolExecutor(max_workers=200) as executor:
        while True:
            futures = [executor.submit(run_query) for _ in range(50)]
            for future in as_completed(futures):
                try:
                    result = future.result()
                    print(result)
                except Exception as e:
                    print("查询异常:", e)
\ No newline at end of file
import time
import requests
import json
from loguru import logger
from apscheduler.schedulers.blocking import BlockingScheduler
from aidso_geo.config.base_config import init_redis, init_redis8
from aidso_geo.utils import bh_utils
from datetime import datetime
# Shared Redis client, created once when this module is imported;
# sync_third_usage_once reads the usage hashes through it.
redis_client = init_redis8()
def safe_int(value, default=0):
    """Convert ``value`` with ``int()``, returning ``default`` for ``None`` or on any error."""
    if value is None:
        return default
    try:
        converted = int(value)
    except Exception:
        return default
    return converted
def get_active_channels():
    """Return the channel, daily_limit and total_limit of every geo_third_token row with status = 1.

    Returns an empty list when the query yields nothing or fails.
    """
    rows = bh_utils.query_data("""
select channel, daily_limit, total_limit
from geo_third_token
where status = 1
""")
    if rows:
        return rows
    return []
def sync_third_usage_once():
    """
    Take one snapshot of third-party token usage (intended to run hourly).

    Reads:
      1. Redis hash ``third_geo_daily`` -- field = channel + YYYYMMDD
      2. Redis hash ``third_geo_total`` -- field = channel
    Writes:
      one row per enabled channel into ``geo_third_usage_snapshot``.

    Every exception is logged and swallowed; the function returns ``None``.
    """
    started_at = datetime.now()
    today_pt = started_at.strftime("%Y%m%d")
    sync_time = started_at.strftime("%Y-%m-%d %H:%M:%S")
    insertime = int(time.time())

    try:
        channels = get_active_channels()
        if not channels:
            logger.info(f"[{sync_time}] sync_third_usage_once 无启用中的 channel")
            return

        # Pull both hashes in full once instead of one HGET per channel.
        # NOTE(review): the lookups below use str keys, which assumes the
        # client returned by init_redis8() decodes responses -- confirm.
        daily_map = redis_client.hgetall("third_geo_daily") or {}
        total_map = redis_client.hgetall("third_geo_total") or {}

        insert_rows = []
        for record in channels:
            channel = str(record.get("channel", "")).strip()
            if channel:
                daily_limit = safe_int(record.get("daily_limit"), 0)
                total_limit = safe_int(record.get("total_limit"), 0)
                today_used = safe_int(daily_map.get(f"{channel}{today_pt}"), 0)
                total_used = safe_int(total_map.get(channel), 0)
                snapshot = {
                    "channel": channel,
                    "pt": today_pt,
                    "daily_limit": daily_limit,
                    "total_limit": total_limit,
                    "today_used": today_used,
                    "total_used": total_used,
                    # Remaining quota never goes below zero.
                    "today_remain": max(daily_limit - today_used, 0),
                    "total_remain": max(total_limit - total_used, 0),
                    "sync_time": sync_time,
                    "insertime": insertime,
                }
                insert_rows.append(snapshot)

        if not insert_rows:
            logger.info(f"[{sync_time}] sync_third_usage_once 无可写入数据")
            return

        if bh_utils.insert_data("geo_third_usage_snapshot", insert_rows):
            logger.info(f"[{sync_time}] sync_third_usage_once 同步成功 rows={len(insert_rows)}")
        else:
            logger.error(f"[{sync_time}] sync_third_usage_once 同步失败 rows={len(insert_rows)}")
    except Exception as e:
        logger.exception(f"[{sync_time}] sync_third_usage_once 执行异常: {e}")
# Platform code -> name of the person responsible for it in alert messages.
owner_map = {
    "BDAI": "崔士豪",
    "KIMI": "崔士豪",
    "TXYBA": "崔士豪",
    "TYQWA": "崔士豪",
    "WXYY": "崔士豪",
    "DB": "李聪健",
    "DOUBA": "李聪健",
    "DP": "李聪健",
    "DPA": "李聪健",
    "DYAI": "李聪健",
    "TXYB": "李聪健",
    "TYQW": "李聪健",
}
# Platforms whose queues are monitored: every platform that has an owner,
# in alphabetical order.
platform_list = sorted(owner_map)
# Feishu bot webhook shared by both senders below.
_FEISHU_WEBHOOK_URL = "https://open.feishu.cn/open-apis/bot/v2/hook/151c5019-9f74-4a94-80b0-f0d643616c9f"
# Upper bound (seconds) for one webhook call, so a stalled connection cannot
# block the calling job indefinitely.
_FEISHU_TIMEOUT_SECONDS = 10


def _post_feishu_text(text, caller):
    """POST ``text`` as a Feishu text message; log the outcome under ``caller``.

    Never raises: any exception (including a timeout) is logged and swallowed.
    """
    payload = json.dumps({
        "msg_type": "text",
        "content": {
            "text": text
        }
    })
    headers = {
        'Content-Type': 'application/json'
    }
    try:
        response = requests.request(
            "POST", _FEISHU_WEBHOOK_URL, headers=headers, data=payload,
            timeout=_FEISHU_TIMEOUT_SECONDS,
        )
        logger.info(f"{caller} 发送完成,status_code={response.status_code},response={response.text}")
    except Exception as e:
        logger.exception(f"{caller} 发送失败: {e}")


def feishu_tobot(data):
    """Send ``"<data> 解析失败"`` to the Feishu bot (best effort, never raises)."""
    _post_feishu_text(f"{data} 解析失败", "feishu_tobot")


def feishu_tobot_sse(data):
    """Send ``str(data)`` verbatim to the Feishu bot (best effort, never raises)."""
    _post_feishu_text(f"{data}", "feishu_tobot_sse")
def fail_task_send_feishu():
    """Report per-platform task statistics for the last six hours to Feishu.

    Queries ``geo_commit_task`` grouped by platform, attributes each platform
    to an owner through ``owner_map`` and sends one text message containing an
    anomaly summary (platforms with ``fail_cnt > 0``) followed by a per-owner
    breakdown. When the query helper returns ``None`` nothing is sent.

    Never raises: every exception is logged and swallowed.
    """
    logger.info("开始执行 fail_task_send_feishu")
    start_time = time.time()
    try:
        ts = int(time.time()) - 6 * 60 * 60
        # NOTE(review): fail_cnt / fail_rate count rows where type = 'success'.
        # That looks inverted (or aimed at the wrong column) for a failure
        # monitor; the query is left unchanged until the intended predicate
        # is confirmed.
        data = bh_utils.query_data(f"""
            select
                platform,
                count() as total_cnt,
                sum(case when type = 'success' then 1 else 0 end) as fail_cnt,
                round(sum(case when type = 'success' then 1 else 0 end) / count() * 100, 2) as fail_rate
            from geo_commit_task
            where insertime > {ts}
            group by platform
            order by platform
        """)
        logger.info(f"fail_task_send_feishu 查询完成,返回平台数={len(data) if data else 0}")
        if data is None:
            # Previously this surfaced as a TypeError from iterating None.
            # Skip the report explicitly rather than claim "no anomalies"
            # when the query produced no result at all.
            logger.warning("fail_task_send_feishu 查询返回 None,本次不发送飞书")
            return

        grouped = {}
        abnormal_list = []
        for item in data:
            platform = item.get("platform")
            owner = owner_map.get(platform, "未知负责人")
            total_cnt = item.get("total_cnt", 0)
            fail_cnt = item.get("fail_cnt", 0)
            fail_rate = item.get("fail_rate", 0)
            grouped.setdefault(owner, []).append({
                "platform": platform,
                "total_cnt": total_cnt,
                "fail_cnt": fail_cnt,
                "fail_rate": fail_rate,
            })
            if fail_cnt > 0:
                abnormal_list.append({
                    "owner": owner,
                    "platform": platform,
                    "fail_cnt": fail_cnt,
                    "fail_rate": fail_rate,
                })

        # Worst offenders first; platform name breaks ties.
        def severity(row):
            return (-row["fail_cnt"], -row["fail_rate"], row["platform"])

        for rows in grouped.values():
            rows.sort(key=severity)
        abnormal_list.sort(key=severity)

        parts = ["【过去六小时平台失败监控】\n\n"]
        if abnormal_list:
            parts.append("❗异常摘要\n")
            for item in abnormal_list:
                parts.append(
                    f"- {item['owner']}:{item['platform']} "
                    f"失败 {item['fail_cnt']},失败率 {item['fail_rate']}%\n"
                )
        else:
            parts.append("✅ 异常摘要:无\n")
        parts.append("\n")

        # Fixed display order; owners without rows are omitted.
        owner_order = ["李聪健", "崔士豪", "未知负责人"]
        for owner in owner_order:
            items = grouped.get(owner, [])
            if not items:
                continue
            parts.append(f"👤 {owner}\n")
            for item in items:
                parts.append(
                    f"- {item['platform']}:"
                    f"总任务 {item['total_cnt']},"
                    f"失败 {item['fail_cnt']},"
                    f"失败率 {item['fail_rate']}%\n"
                )
            parts.append("\n")

        feishu_tobot_sse("".join(parts))
        logger.info(f"fail_task_send_feishu 执行完成,异常平台数={len(abnormal_list)},耗时={round(time.time() - start_time, 2)}s")
    except Exception as e:
        logger.exception(f"fail_task_send_feishu 执行失败: {e}")
def task_queue_backlog():
    """Report Redis queue backlog per platform to Feishu.

    For each platform in ``platform_list`` the lengths of the lists
    ``<platform>:geo:stream_batch:list`` and ``<platform>:geo:batch:list`` are
    read. A message is sent only when at least one list is non-empty;
    otherwise only a log line is written.

    Never raises: every exception is logged and swallowed.
    """
    logger.info("开始执行 task_queue_backlog")
    start_time = time.time()
    try:
        r = init_redis()
        grouped = {}
        abnormal_list = []
        for platform in platform_list:
            owner = owner_map.get(platform, "未知负责人")
            stream_batch_cnt = r.llen(f"{platform}:geo:stream_batch:list")
            batch_cnt = r.llen(f"{platform}:geo:batch:list")
            grouped.setdefault(owner, []).append({
                "platform": platform,
                "stream_batch_cnt": stream_batch_cnt,
                "batch_cnt": batch_cnt,
            })
            # One anomaly entry per non-empty queue.
            for task_type, cnt in (("stream_batch", stream_batch_cnt), ("batch", batch_cnt)):
                if cnt > 0:
                    abnormal_list.append({
                        "owner": owner,
                        "platform": platform,
                        "task_type": task_type,
                        "cnt": cnt,
                    })

        # Largest combined backlog first within each owner.
        for rows in grouped.values():
            rows.sort(
                key=lambda x: (-(x["stream_batch_cnt"] + x["batch_cnt"]), x["platform"])
            )
        abnormal_list.sort(key=lambda x: (-x["cnt"], x["owner"], x["platform"], x["task_type"]))

        if not abnormal_list:
            logger.info(f"task_queue_backlog 无异常,不发送飞书,耗时={round(time.time() - start_time, 2)}s")
            return

        pieces = ["【平台队列积压监控】\n\n", "❗异常摘要\n"]
        for entry in abnormal_list:
            pieces.append(
                f"- {entry['owner']}:{entry['platform']} "
                f"{entry['task_type']} 积压 {entry['cnt']}\n"
            )
        pieces.append("\n")

        # Fixed display order; owners without rows are omitted.
        for owner in ["李聪健", "崔士豪", "未知负责人"]:
            rows = grouped.get(owner, [])
            if rows:
                pieces.append(f"👤 {owner}\n")
                for entry in rows:
                    pieces.append(
                        f"- {entry['platform']}:"
                        f"stream_batch 积压 {entry['stream_batch_cnt']},"
                        f"batch 积压 {entry['batch_cnt']}\n"
                    )
                pieces.append("\n")

        feishu_tobot_sse("".join(pieces))
        logger.info(
            f"task_queue_backlog 执行完成,异常项数={len(abnormal_list)},耗时={round(time.time() - start_time, 2)}s")
    except Exception as e:
        logger.exception(f"task_queue_backlog 执行失败: {e}")
if __name__ == '__main__':
    logger.info("监控调度器启动")
    scheduler = BlockingScheduler(timezone="Asia/Shanghai")
    # fail_task_send_feishu: at 00:03, 06:03, 12:03 and 18:03
    scheduler.add_job(
        fail_task_send_feishu,
        trigger='cron',
        hour='0,6,12,18',
        minute=3,
        id='fail_task_send_feishu',
        max_instances=1,
        coalesce=True,
        replace_existing=True
    )
    # task_queue_backlog: once per hour, on the hour
    scheduler.add_job(
        task_queue_backlog,
        trigger='cron',
        minute=0,
        id='task_queue_backlog',
        max_instances=1,
        coalesce=True,
        replace_existing=True
    )
    # sync_third_usage_once: once per hour, at minute 5
    scheduler.add_job(
        sync_third_usage_once,
        trigger='cron',
        minute=5,
        id='sync_third_usage_once',
        max_instances=1,
        coalesce=True,
        replace_existing=True
    )
    logger.info(
        "定时任务注册完成:"
        "fail_task_send_feishu(每6小时), "
        "task_queue_backlog(每小时), "
        "sync_third_usage_once(每小时)"
    )
    # The jobs were registered but the scheduler was never started, so the
    # process exited immediately. start() blocks until shutdown.
    try:
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        logger.info("监控调度器停止")
import requests
import os, sys
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(BASE_DIR)
from aidso_geo.config.base_config import PlatformType, BaseConfig
from aidso_geo.utils.tos_utils import put_string_to_tos
def get_platform_response(
        data
) -> bool:
    """Fetch one platform answer for a task and store the raw payload in TOS.

    Args:
        data: task dict. Reads ``platform``, ``prompt`` and ``taskId``, plus
            the optional ``searchEnabled`` (default ``'1'``) and
            ``thinkingEnabled``.

    Returns:
        ``True`` once a response with ``code == 200`` has been handed to
        ``put_string_to_tos``; ``False`` for an unknown or unconfigured
        platform, for any exception, or when no attempt returned 200.
    """
    max_retries = 3
    try:
        platform_enum = PlatformType.from_str(data["platform"])
    except ValueError:
        return False
    platform_config = BaseConfig.PLATFORM_CONFIGS.get(platform_enum.value)
    if not platform_config:
        return False

    params = {
        "message": data['prompt'],
        "search_enabled": data.get('searchEnabled', '1'),
    }
    thinking_enabled = data.get('thinkingEnabled')
    if thinking_enabled:
        params["thinking_enabled"] = thinking_enabled

    # Every non-200 answer consumes one attempt. Previously only code 400
    # advanced the counter, so any other code repeated the request forever.
    for _ in range(max_retries):
        try:
            response = requests.get(platform_config["url"], params=params, timeout=600)
            print(platform_config["url"])
            response_data = response.json()
            storage_path = platform_config["storage_path"](data['taskId'])
            if response_data.get("code") == 200:
                # NOTE(review): the upload result is ignored, as before;
                # decide whether a failed upload should yield False.
                put_string_to_tos(storage_path, response_data.get("data"))
                return True
        except Exception:
            return False
    return False
if __name__ == '__main__':
    # Manual smoke test: request one answer from the TXYBA platform.
    sample_task = dict([
        ("prompt", "羽绒服品牌推荐?"),
        ("taskId", "3fa90552-d3c7-4c26-aaf6-6722a213a240"),
        ("reqId", "aaaaaaAwwww069i"),
        ("platform", "TXYBA"),
        ("type", "stream"),
    ])
    outcome = get_platform_response(sample_task)
    print(outcome)
import time
import json
import os, sys
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(BASE_DIR)
from aidso_geo.clients.tos_client import tos_client
import tos
def put_string_to_tos(object_key: str, content, retry_times: int = 3) -> bool:
    """Upload ``content`` to the configured TOS bucket under ``object_key``.

    Args:
        object_key: destination key inside the bucket.
        content: a ``str`` is uploaded as is; anything else is serialised with
            ``json.dumps(..., ensure_ascii=False)`` so non-ASCII text stays
            readable.
        retry_times: maximum number of upload attempts.

    Returns:
        ``True`` after the first successful upload; ``False`` when the content
        cannot be serialised or every attempt failed.
    """
    client = tos_client.get_client()
    bucket = tos_client.bucket_name

    # Serialise once, outside the retry loop: the result does not change
    # between attempts and a serialisation error will not fix itself.
    if isinstance(content, str):
        content_str = content
    else:
        try:
            content_str = json.dumps(content, ensure_ascii=False)
        except Exception:
            return False

    for _ in range(retry_times):
        try:
            client.put_object(bucket, object_key, content=content_str)
            return True
        except Exception:
            continue
    return False
def get_string_from_tos(object_key, retry_times=5):
    """Download ``object_key`` and return its body decoded as UTF-8.

    Up to ``retry_times`` attempts are made; every exception triggers the next
    attempt. Returns ``False`` (not ``None``) when all attempts fail.
    """
    client = tos_client.get_client()
    bucket = tos_client.bucket_name
    attempts_left = retry_times
    while attempts_left > 0:
        attempts_left -= 1
        try:
            return client.get_object(bucket, object_key).read().decode('utf-8')
        except (tos.exceptions.TosServerError, Exception):
            # Server errors and every other failure are retried alike.
            continue
    return False
def check_file_in_tos(object_key):
    """Return ``True`` when a HEAD request for ``object_key`` succeeds.

    Any exception, from building the client or from the request itself, is
    reported as ``False``.
    """
    try:
        tos_handle = tos_client.get_client()
        target_bucket = tos_client.bucket_name
        tos_handle.head_object(bucket=target_bucket, key=object_key)
    except Exception:
        return False
    return True
def get_tos_file_size(object_key: str, retry_times: int = 3):
    """Return the size in bytes of ``object_key``, or ``None`` if unavailable.

    Args:
        object_key: key inside the configured bucket.
        retry_times: maximum number of HEAD attempts. Previously the first
            failure returned immediately, so this parameter had no effect.

    Returns:
        The size as ``int``; ``None`` when the object is missing, when no size
        attribute is present on the HEAD result, when the size is not numeric,
        or when every attempt failed.
    """
    client = tos_client.get_client()
    bucket = tos_client.bucket_name
    for _ in range(retry_times):
        try:
            result = client.head_object(bucket, object_key)
        except Exception as e:
            # A missing object will not appear on retry.
            # NOTE(review): assumes the SDK error exposes ``status_code``;
            # without it the attempt is simply retried.
            if getattr(e, "status_code", None) == 404:
                return None
            continue
        # Tolerate different attribute spellings across SDK versions.
        for attr in ("content_length", "contentLength", "content-length"):
            size = getattr(result, attr, None)
            if size is not None:
                try:
                    return int(size)
                except (TypeError, ValueError):
                    return None
        return None
    return None
if __name__ == '__main__':
    # Manual check: print the stored size of one sample object.
    sample_key = 'geo/5ef9bbf4-b356-40af5d44fff30a/TXYB/original.text'
    # Other manual checks kept for reference:
    # put_string_to_tos('geo/1005/DP/aaaa.txt',"['2025年11月5日 天气']")
    # print(check_file_in_tos('geo/1d38cb5e-0b99-49e8-8f68-13688079b719/DP/result.json'))
    size_in_bytes = get_tos_file_size(sample_key)
    print(size_in_bytes)
import os, sys
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(BASE_DIR)
from aidso_geo.utils.bh_utils import query_data, insert_data
def _ch_escape_string(s: str) -> str:
# s = s.replace("\x00", "") # 防御性:去掉NUL
# s = s.replace("\\", "\\\\") # 可保留,可不保留(URL一般没有\)
# s = s.replace("'", "''") # 关键:单引号用两个单引号
# return f"'{s}'"
return s.replace("\\", "\\\\").replace("'", "''")
def _lookup_url_ids(urls):
    """Return ``{url: id}`` for those of ``urls`` present in ``geo_dim_url``.

    ``urls`` must be a non-empty list of strings; each one is escaped before
    being placed in the ``IN (...)`` list.
    """
    in_list = ",".join(f"'{_ch_escape_string(url)}'" for url in urls)
    rows = query_data(f"select id,url from geo_dim_url where url in ({in_list})") or []
    return {row["url"]: row.get("id") for row in rows if row and row.get("url")}


def generate_numeric_url_id(urls):
    """Map each URL to its numeric id in ``geo_dim_url``, creating missing rows.

    Args:
        urls: iterable of URLs. Only ``str`` entries are looked up or
            inserted; other entries map to ``None`` in the result.

    Returns:
        ``{url: id_or_None}`` with one key per distinct element of ``urls``.
        A URL maps to ``None`` when it still cannot be found after the insert.
    """
    # Distinct string URLs in first-seen order.
    candidates = list(dict.fromkeys(u for u in urls if isinstance(u, str)))
    url_id_map = {}
    if candidates:  # skip the query entirely instead of sending "in ()"
        url_id_map.update(_lookup_url_ids(candidates))
        missing_urls = [u for u in candidates if u not in url_id_map]
        if missing_urls:
            insert_data("geo_dim_url", [{"url": u} for u in missing_urls])
            # Re-read to learn the ids assigned by the insert. This query now
            # goes through the same escaping as the first one; previously the
            # URLs were interpolated raw.
            url_id_map.update(_lookup_url_ids(missing_urls))
    return {u: url_id_map.get(u) for u in urls}
if __name__ == '__main__':
    # Manual check: resolve (or allocate) the id of one sample URL.
    sample_urls = ["https://baike.baidu"]
    resolved = generate_numeric_url_id(sample_urls)
    print(resolved)
    # Sample mapping left by the author for reference:
    # {'http: // m.toutiao.com / group / 7598066097035805219 / ': 7421792398252101632,
    # 'http: // m.toutiao.com / group / 7598067408829514280 / ': 7421792398252101633,
    # 'https: // www.iesdouyin.com / share / video / 7527608280689904956': 7421792398252101634}
    # print(quto)
import time
from kafka import KafkaProducer
import json
import random
def producer_message(topic,key,data_list):
    """Send every element of ``data_list`` to Kafka ``topic`` as JSON.

    Args:
        topic: destination topic.
        key: upper bound (inclusive) for the random integer message key; each
            message gets ``random.randint(0, key)`` as its key.
        data_list: iterable of JSON-serialisable values, one message each.

    Returns:
        ``None``. Send and flush errors are printed, not raised, so callers
        cannot tell success from failure.
    """
    # Kafka broker address
    bootstrap_servers = ['172.16.3.6:9092']
    producer = KafkaProducer(
        bootstrap_servers=bootstrap_servers,
        key_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
        value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
        # Batching parameters
        batch_size=16384,
        linger_ms=5,
        compression_type='snappy'
    )
    try:
        for record in data_list:
            producer.send(topic, key=random.randint(0, key), value=record)
        producer.flush()
    except Exception as e:
        print(f"发送失败:{str(e)}")
    finally:
        # A producer is created on every call; close it so its resources are
        # released instead of accumulating in a long-running process.
        producer.close()
# Placeholder entry point: running this module directly does nothing.
if __name__ == '__main__':
    ...
import redis
from flask import Flask, request, jsonify
from datetime import datetime, timedelta
import json
import os,sys
import io
import gzip
base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(base_path)
import kafka_utils
# Flask application that serves the task hand-out / submit / retry endpoints below.
app = Flask(__name__)
# Shared Redis connection; decode_responses=True makes replies str instead of bytes.
# NOTE(review): host and password are hard-coded in source -- consider loading
# them from configuration or the environment and rotating the password.
r = redis.Redis(host='172.16.0.24', port=6379, db=0, password="aiyingli@@123",decode_responses=True)
# Redis set holding the pending DY tasks.
DY_TASK_SET_KEY = "dso_monitor_keyword_rank_third"
DY_TASK_CHANNEL_COUNT_PREFIX = "dy:task:channel:count:" # per-day hash counting tasks handed out per channel
# NOTE(review): unlike the prefix above this one has no trailing ':', so the
# date is appended directly after "failed". Existing keys depend on it.
DY_TASK_CHANNEL_COUNT_PREFIX_FAILED = "dy:task:channel:count:failed" # per-day hash counting retried (failed) tasks per channel
def get_today_date_str():
    """Return today's local date as ``YYYYMMDD``; used to build the daily counter keys."""
    today = datetime.now()
    return f"{today:%Y%m%d}"
@app.route('/api/dy/task/get', methods=['GET'])
def get_dy_task():
    """Hand out up to ``count`` pending DY tasks to ``channel``.

    Query parameters:
        channel: required; name of the consumer taking the tasks.
        count: optional positive integer, default 1.

    Responses:
        400 when ``channel`` is missing or ``count`` is not a positive integer;
        200 with status ``empty`` when the task set is empty;
        200 with status ``success`` and the popped tasks otherwise.
    """
    channel = request.args.get('channel')
    count_str = request.args.get('count', '1')
    if not channel:
        return jsonify({"status": "error", "message": "渠道参数不能为空"}), 400
    try:
        count = int(count_str)
        if count <= 0:
            raise ValueError("count必须为正整数")
    except ValueError:
        return jsonify({"status": "error", "message": "count参数必须是正整数"}), 400
    tasks = r.spop(DY_TASK_SET_KEY, count)
    if not tasks:
        return jsonify({"status": "empty", "message": "没有可用任务"}), 200
    tasks = list(tasks)
    actual_count = len(tasks)
    # Record the whole batch with one increment instead of one Redis round
    # trip per task; the daily counter hash is kept for a year.
    task_channel_key = f"{DY_TASK_CHANNEL_COUNT_PREFIX}{get_today_date_str()}"
    r.hincrby(task_channel_key, channel, actual_count)
    r.expire(task_channel_key, 60 * 60 * 24 * 365)
    return jsonify({
        "status": "success",
        "tasks": tasks,
        "actual_count": actual_count,
        "requested_count": count
    }), 200
@app.route('/api/dy/task/submit', methods=['POST'])
def submit_dy_data():
    """Accept a JSON array of DY ranking records and forward it to Kafka.

    The body may be gzip-compressed (``Content-Encoding: gzip``). Every array
    element must be an object containing all required fields; the first
    offending element yields a 400. Valid payloads are passed to
    ``kafka_utils.producer_message`` on topic ``dy_aidso_monitor``.
    """
    def rejected(message):
        # Uniform 400 response for every validation failure.
        return jsonify({
            'success': False,
            'message': message
        }), 400

    try:
        if request.headers.get('Content-Encoding') == 'gzip':
            # Compressed body: inflate it, then parse the bytes as JSON.
            with gzip.GzipFile(fileobj=io.BytesIO(request.get_data()), mode='rb') as stream:
                raw_bytes = stream.read()
            data = json.loads(raw_bytes)
        else:
            # Plain body: let Flask parse it.
            data = request.get_json()
    except gzip.BadGzipFile:
        return rejected('无效的gzip压缩数据')
    except Exception as e:
        # JSON parsing failed (compressed or not).
        return rejected(f'JSON解析失败: {str(e)}')

    if not isinstance(data, list):
        return rejected('请求参数必须是数组')

    required_fields = [
        'key_word',
        'rank',
        'video_id',
        'update_pt',
        'create_pt',
        'channel'
    ]
    for position, item in enumerate(data, start=1):
        if not isinstance(item, dict):
            return rejected(f'数组中第 {position} 个元素必须是对象')
        missing_fields = [name for name in required_fields if name not in item]
        if missing_fields:
            return rejected(f'数组中第 {position} 个元素缺少必填字段: {", ".join(missing_fields)}')

    kafka_utils.producer_message('dy_aidso_monitor', 3, data)
    return jsonify({
        'success': True,
        'message': '数据提交成功',
        'received_count': len(data)
    }), 200
@app.route('/api/dy/task/retry', methods=['POST'])
def retry_dy_failed_tasks():
    """Put failed DY keywords back into the pending set and count them.

    Expected JSON body: ``{"keywords": [...], "channel": "<name>"}``.

    Responses:
        400 for a body that is not valid JSON, is empty, is not an object,
        lacks a required field, or whose ``keywords`` is not a list;
        200 on success (an empty ``keywords`` list is accepted as a no-op);
        500 for any unexpected error.
    """
    try:
        # silent=True returns None for unparsable bodies instead of raising.
        # The former ``except json.JSONDecodeError`` could not catch the
        # exception Flask raises, so bad JSON produced a 500.
        data = request.get_json(silent=True)
        if data is None:
            return jsonify({
                "success": False,
                "message": "请求必须是有效的JSON格式"
            }), 400
        if not data:
            return jsonify({
                "success": False,
                "message": "请求体不能为空"
            }), 400
        if not isinstance(data, dict):
            return jsonify({
                "success": False,
                "message": "请求体必须是JSON对象"
            }), 400
        required_fields = ["keywords", "channel"]
        missing_fields = [field for field in required_fields if field not in data]
        if missing_fields:
            return jsonify({
                "success": False,
                "message": f"缺少必要字段: {', '.join(missing_fields)}",
                "required_fields": required_fields
            }), 400
        if not isinstance(data["keywords"], list):
            return jsonify({
                "success": False,
                "message": "keywords必须是列表类型(例如: [1, 2, 3])"
            }), 400
        items = data.get("keywords")
        channel = data.get("channel")
        item_count = len(items)
        if item_count:
            # SADD rejects an empty member list, so only touch Redis when
            # there is something to add; count the batch with one increment.
            r.sadd(DY_TASK_SET_KEY, *items)
            task_channel_key = f"{DY_TASK_CHANNEL_COUNT_PREFIX_FAILED}{get_today_date_str()}"
            r.hincrby(task_channel_key, channel, item_count)
            r.expire(task_channel_key, 60 * 60 * 24 * 365)
        return jsonify({
            "success": True,
            "message": f"成功接收字符串列表,共{item_count}个元素",
        }), 200
    except Exception as e:
        return jsonify({
            "success": False,
            "message": f"服务器处理错误: {str(e)}"
        }), 500
@app.route('/api/dy/result/search', methods=['GET'])
def get_dy_channel_stats():
    """Return today's DY hand-out / retry counters and the pending task count.

    With a ``channel`` query parameter only that channel's two counters are
    returned; without it, both complete per-channel hashes are returned.
    """
    channel = request.args.get('channel')
    all_task = r.scard(DY_TASK_SET_KEY)
    day = get_today_date_str()
    success_key = f"{DY_TASK_CHANNEL_COUNT_PREFIX}{day}"
    failed_key = f"{DY_TASK_CHANNEL_COUNT_PREFIX_FAILED}{day}"
    if channel is not None:
        success_data = r.hget(success_key, channel)
        failed_data = r.hget(failed_key, channel)
    else:
        success_data = r.hgetall(success_key)
        failed_data = r.hgetall(failed_key)
    payload = {
        "success": success_data,
        "failed": failed_data,
        "all_task": all_task
    }
    return jsonify({"status": "success", "data": payload}), 200
# Redis key names for the XHS task queue and its per-channel daily counters.
XHS_TASK_SET_KEY = "xhs:task:all"
XHS_TASK_CHANNEL_COUNT_PREFIX = "xhs:task:channel:count:" # per-day hash counting tasks handed out per channel
# NOTE(review): no trailing ':' here, so the date is appended directly after
# "failed". Existing keys depend on it.
XHS_TASK_CHANNEL_COUNT_PREFIX_FAILED = "xhs:task:channel:count:failed" # per-day hash counting retried (failed) tasks per channel
@app.route('/api/xhs/task/get', methods=['GET'])
def get_xhs_task():
    """Hand out up to ``count`` pending XHS tasks to ``channel``.

    Query parameters:
        channel: required; name of the consumer taking the tasks.
        count: optional positive integer, default 1.

    Responses:
        400 when ``channel`` is missing or ``count`` is not a positive integer;
        200 with status ``empty`` when the task set is empty;
        200 with status ``success`` and the popped tasks otherwise.
    """
    channel = request.args.get('channel')
    count_str = request.args.get('count', '1')
    if not channel:
        return jsonify({"status": "error", "message": "渠道参数不能为空"}), 400
    try:
        count = int(count_str)
        if count <= 0:
            raise ValueError("count必须为正整数")
    except ValueError:
        return jsonify({"status": "error", "message": "count参数必须是正整数"}), 400
    tasks = r.spop(XHS_TASK_SET_KEY, count)
    if not tasks:
        return jsonify({"status": "empty", "message": "没有可用任务"}), 200
    tasks = list(tasks)
    actual_count = len(tasks)
    # Record the whole batch with one increment instead of one Redis round
    # trip per task; the daily counter hash is kept for a year.
    task_channel_key = f"{XHS_TASK_CHANNEL_COUNT_PREFIX}{get_today_date_str()}"
    r.hincrby(task_channel_key, channel, actual_count)
    r.expire(task_channel_key, 60 * 60 * 24 * 365)
    return jsonify({
        "status": "success",
        "tasks": tasks,
        "actual_count": actual_count,
        "requested_count": count
    }), 200
@app.route('/api/xhs/task/submit', methods=['POST'])
def submit_xhs_data():
    """Accept a JSON array of XHS ranking records and forward it to Kafka.

    Every array element must be an object containing all required fields; the
    first offending element yields a 400. Valid payloads are passed to
    ``kafka_utils.producer_message`` on topic ``xhs_aidso_monitor``.
    """
    def rejected(message):
        # Uniform 400 response for every validation failure.
        return jsonify({
            'success': False,
            'message': message
        }), 400

    data = request.get_json()
    if not isinstance(data, list):
        return rejected('请求参数必须是数组')

    required_fields = [
        'key_word',
        'rank',
        'video_id',
        'update_pt',
        'create_pt',
        'channel'
    ]
    for position, item in enumerate(data, start=1):
        if not isinstance(item, dict):
            return rejected(f'数组中第 {position} 个元素必须是对象')
        missing_fields = [name for name in required_fields if name not in item]
        if missing_fields:
            return rejected(f'数组中第 {position} 个元素缺少必填字段: {", ".join(missing_fields)}')

    kafka_utils.producer_message('xhs_aidso_monitor', 3, data)
    return jsonify({
        'success': True,
        'message': '数据提交成功',
        'received_count': len(data)
    }), 200
@app.route('/api/xhs/task/retry', methods=['POST'])
def retry_xhs_failed_tasks():
    """Put failed XHS keywords back into the XHS pending set and count them.

    Expected JSON body: ``{"keywords": [...], "channel": "<name>"}``.

    Responses:
        400 for a body that is not valid JSON, is empty, is not an object,
        lacks a required field, or whose ``keywords`` is not a list;
        200 on success (an empty ``keywords`` list is accepted as a no-op);
        500 for any unexpected error.
    """
    try:
        # silent=True returns None for unparsable bodies instead of raising.
        # The former ``except json.JSONDecodeError`` could not catch the
        # exception Flask raises, so bad JSON produced a 500.
        data = request.get_json(silent=True)
        if data is None:
            return jsonify({
                "success": False,
                "message": "请求必须是有效的JSON格式"
            }), 400
        if not data:
            return jsonify({
                "success": False,
                "message": "请求体不能为空"
            }), 400
        if not isinstance(data, dict):
            return jsonify({
                "success": False,
                "message": "请求体必须是JSON对象"
            }), 400
        required_fields = ["keywords", "channel"]
        missing_fields = [field for field in required_fields if field not in data]
        if missing_fields:
            return jsonify({
                "success": False,
                "message": f"缺少必要字段: {', '.join(missing_fields)}",
                "required_fields": required_fields
            }), 400
        if not isinstance(data["keywords"], list):
            return jsonify({
                "success": False,
                "message": "keywords必须是列表类型(例如: [1, 2, 3])"
            }), 400
        items = data.get("keywords")
        channel = data.get("channel")
        item_count = len(items)
        if item_count:
            # Re-queue into the XHS set. This previously wrote to
            # DY_TASK_SET_KEY, so retried XHS keywords landed in the DY queue
            # and never reached /api/xhs/task/get. SADD also rejects an empty
            # member list, hence the guard.
            r.sadd(XHS_TASK_SET_KEY, *items)
            task_channel_key = f"{XHS_TASK_CHANNEL_COUNT_PREFIX_FAILED}{get_today_date_str()}"
            r.hincrby(task_channel_key, channel, item_count)
            r.expire(task_channel_key, 60 * 60 * 24 * 365)
        return jsonify({
            "success": True,
            "message": f"成功接收字符串列表,共{item_count}个元素",
        }), 200
    except Exception as e:
        return jsonify({
            "success": False,
            "message": f"服务器处理错误: {str(e)}"
        }), 500
@app.route('/api/xhs/result/search', methods=['GET'])
def get_xhs_channel_stats():
    """Return today's XHS hand-out / retry counters and the pending task count.

    With a ``channel`` query parameter only that channel's two counters are
    returned; without it, both complete per-channel hashes are returned.
    """
    channel = request.args.get('channel')
    # Size of the XHS pending set. This previously read DY_TASK_SET_KEY and
    # therefore reported the DY backlog.
    all_task = r.scard(XHS_TASK_SET_KEY)
    today_date = get_today_date_str()
    failed_key = f"{XHS_TASK_CHANNEL_COUNT_PREFIX_FAILED}{today_date}"
    success_key = f"{XHS_TASK_CHANNEL_COUNT_PREFIX}{today_date}"
    if channel is None:
        success_data = r.hgetall(success_key)
        failed_data = r.hgetall(failed_key)
    else:
        success_data = r.hget(success_key, channel)
        failed_data = r.hget(failed_key, channel)
    return jsonify({
        "status": "success",
        "data": {
            "success": success_data,
            "failed": failed_data,
            "all_task": all_task
        }
    }), 200
if __name__ == '__main__':
    # Debug mode enables the interactive Werkzeug debugger, which allows code
    # execution from the browser. The server listens on every interface, so
    # debug is now opt-in: set FLASK_DEBUG=1 for local development.
    app.run(host='0.0.0.0', port=8085, debug=os.environ.get('FLASK_DEBUG') == '1')
import time
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import json
import redis
import requests
from itertools import islice
import datetime
# Shared Elasticsearch client used by bulk_update().
# NOTE(review): the host and basic-auth credentials are hard-coded in source --
# consider loading them from configuration or the environment.
es = Elasticsearch(
    hosts='elasticsearch-o-0080ceq4wig5.escloud.ivolces.com',
    http_auth=('admin', 'Aiyingli123'),
    timeout=30,
    max_retries=3,
    retry_on_timeout=True # retry a request that timed out
)
# Shared Redis client; decode_responses=True makes replies str instead of bytes.
# NOTE(review): the password is hard-coded in source as well.
redis_client = redis.Redis(host='172.16.0.24', port=6379, password='aiyingli@@123', socket_timeout=10,
                           decode_responses=True, db=0)
def bulk_update(request_list):
    """Apply a batch of update requests through the Elasticsearch bulk helper.

    Each request is a dict with ``index``, ``id`` and ``type``, plus either
    ``doc`` (partial document, defaults to ``{}``) or ``script``. When
    ``script`` is present it replaces ``doc``. Every action carries
    ``doc_as_upsert=True``.

    Returns ``None``. Per-item failures are printed (first five only); an
    exception from the bulk call is printed as a traceback, not raised.

    NOTE(review): a script action still carries ``doc_as_upsert`` without a
    ``doc`` -- confirm the cluster accepts that combination.
    """
    if not request_list or len(request_list) == 0:
        return

    bulk_actions = []
    for req in request_list:
        print(req)
        action = {
            "_op_type": "update",
            "_index": req["index"],
            "_id": req["id"],
            "_type": req["type"],
        }
        partial_doc = req.get("doc", {})
        if "script" in req:
            action["doc_as_upsert"] = True
            action["script"] = req["script"]
        else:
            action["doc"] = partial_doc
            action["doc_as_upsert"] = True
        bulk_actions.append(action)

    try:
        success, failed = bulk(
            client=es,
            actions=bulk_actions,
            chunk_size=1000,
            raise_on_error=False
        )
        print(f" 批量更新完成 | 成功: {success} | 失败: {len(failed)}")
        if failed:
            print(f"失败详情: {json.dumps(failed[:5], ensure_ascii=False)}")
    except Exception as e:
        import traceback
        traceback.print_exc()
def get_timestamp(timestamp):
    """Convert a millisecond epoch value (int or numeric str) to ``YYYYMM``.

    The conversion uses the process's local time zone.
    """
    seconds = int(timestamp) / 1000
    moment = datetime.datetime.fromtimestamp(seconds)
    return moment.strftime("%Y%m")
def split_by_20_itertools(data):
    """Split any iterable into consecutive lists of at most 20 items.

    Returns a list of lists; the final chunk may be shorter, and an empty
    input yields ``[]``.
    """
    source = iter(data)
    # Keep taking slices of 20 until one comes back empty.
    return list(iter(lambda: list(islice(source, 20)), []))
def check_open(user_ids):
    """Ask the live service which of ``user_ids`` are currently live.

    Args:
        user_ids: comma-separated user ids, placed verbatim in the query string.

    Returns:
        The ``user_live`` list of the first entry of the decoded payload, or
        ``[]`` when the response cannot be parsed or carries no such list.
        It previously returned ``None`` in that case, which the caller then
        passed to ``len()``.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    url = f"http://172.16.1.37:8873/getOpenLive?user_ids={user_ids}"
    payload = {}
    headers = {}
    # Bounded wait so one stalled call cannot freeze the polling loop.
    response = requests.request("GET", url, headers=headers, data=payload, timeout=30)
    try:
        # The outer "data" field is itself a JSON-encoded string.
        json_data = response.json().get('data')
        return json.loads(json_data).get('data')[0].get('user_live') or []
    except Exception as e:
        print(e)
        return []
def get_open_live():
    """Discover live rooms for all tracked users and remember them in Redis.

    Reads the user ids from the Redis set ``third_hx_user_id``, queries them
    in groups of 20 and adds one ``"<room_id>:::<user_id>"`` member per live
    room to the set ``HX:THIRD:LIVES``.
    """
    grouped = split_by_20_itertools(redis_client.smembers('third_hx_user_id'))
    live_list = []
    for group in grouped:
        user_ids = ','.join(str(num) for num in group)
        # A group whose lookup yielded nothing (None or empty) adds no rooms.
        for live in check_open(user_ids) or []:
            live_list.append(f"{live.get('room_id')}:::{live.get('user_id')}")
    print(f"扫描到{len(live_list)}场直播")
    if live_list:
        # SADD needs at least one member; calling it with none is an error.
        redis_client.sadd('HX:THIRD:LIVES', *live_list)
def get_live_info(live_id, user_id):
    """Fetch the details of one live room.

    Returns the ``data`` field of the JSON response (``None`` when absent).

    Raises:
        requests.RequestException: on connection failure or timeout.
        ValueError: when the response body is not valid JSON.
    """
    url = f"http://172.16.1.37:8873/getLiveInfo?live_id={live_id}&user_id={user_id}"
    payload = {}
    headers = {}
    # Bounded wait so one stalled call cannot freeze the polling loop.
    response = requests.request("GET", url, headers=headers, data=payload, timeout=30)
    json_data = response.json()
    return json_data.get('data')
def _build_live_base_request(result_data, include_max_user_count):
    """Build one bulk_update request dict from a live-info payload.

    ``include_max_user_count`` adds the ``maxUserCount`` field, which is only
    written for lives that are still running.
    """
    create_time = int(result_data.get('create_time'))
    finish_time = int(result_data.get('finish_time'))
    doc = {
        "liveId": result_data.get('live_id'),
        "cover": result_data.get('cover'),
        "title": result_data.get('title'),
        "totalUser": result_data.get('total_user'),
        "totalUser2": result_data.get('total_user'),
    }
    if include_max_user_count:
        doc["maxUserCount"] = int(result_data.get('max_user_count'))
    doc.update({
        "createTime": create_time,
        "finishTime": finish_time,
        "duringTime": finish_time - create_time,
        "shareUrl": result_data.get('share_url'),
        "userId": result_data.get('user_id'),
        "nickname": result_data.get('nickname'),
        "avatarLarger": result_data.get('avatar_larger'),
        "isFinish": result_data.get('is_finish'),
        "followerCount": int(result_data.get('follower_count')),
    })
    return {
        # One index per month of the live's start time.
        "index": f"tiktok_user_live_base_data_kgj_{get_timestamp(result_data.get('create_time'))}",
        "type": "es_user_live_base_data",
        "id": result_data.get('live_id'),
        "doc": doc,
        "doc_as_upsert": True
    }


def search_live_base():
    """Refresh base data of every tracked live and drop the ones that ended.

    For each ``"<live_id>:::<user_id>"`` member of ``HX:THIRD:LIVES``:
      * finished live   -> upsert its data and stop tracking it;
      * running live with a non-'0' create_time -> upsert its data
        (including ``maxUserCount``);
      * anything else   -> stop tracking it.
    Entries whose lookup returns no payload are skipped and stay tracked.
    """
    live_set = redis_client.smembers('HX:THIRD:LIVES')
    if not live_set:
        return
    result_data_list = []
    finish_live = []
    for entry in live_set:
        print(f"直播信息{entry}获取中")
        parts = entry.split(':::')
        live_id, user_id = parts[0], parts[1]
        result_data = get_live_info(live_id, user_id)
        if not result_data:
            # No payload for this live: skip it instead of failing the whole
            # batch with an AttributeError; it is retried next cycle.
            print(f"直播信息{entry}获取失败,跳过")
            continue
        is_finish = result_data.get('is_finish')
        if is_finish == True:  # noqa: E712 -- keep the original loose comparison
            finish_live.append(entry)
            result_data_list.append(_build_live_base_request(result_data, False))
        elif is_finish == False and result_data.get('create_time') != '0':  # noqa: E712
            result_data_list.append(_build_live_base_request(result_data, True))
        else:
            finish_live.append(entry)
    if finish_live:
        # SREM needs at least one member; calling it with none is an error.
        redis_client.srem('HX:THIRD:LIVES', *finish_live)
    bulk_update(result_data_list)
if __name__ == '__main__':
    # Poll forever: discover new lives, refresh tracked ones, then wait five
    # minutes.
    while True:
        try:
            get_open_live()
            search_live_base()
        except Exception as e:
            print(e)
        # The sleep used to sit inside the try, so a failing cycle retried
        # immediately in a tight loop. Wait after every cycle instead.
        time.sleep(300)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment