"""
小说质量评估模块
提供多维度的质量评估和改进建议
"""

import json
import logging
import re
from collections import Counter
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)


@dataclass
class QualityMetrics:
    """Result record of one quality assessment run.

    The field order is part of the interface (the dataclass can be built
    positionally), so it must not be rearranged.
    """

    # --- Basic counts ---
    total_chars: int        # length of the raw text
    effective_chars: int    # Chinese characters + punctuation + digits
    paragraph_count: int
    sentence_count: int
    dialogue_count: int     # number of dialogue-marker matches

    # --- Proportions ---
    dialogue_ratio: float     # characters inside dialogue markers / total characters
    description_ratio: float  # keyword-based estimate, not a true share of the text
    action_ratio: float       # keyword-based estimate, not a true share of the text

    # --- Complexity ---
    avg_sentence_length: float
    vocabulary_richness: float
    repetition_rate: float

    # --- Structure ---
    scene_transitions: int
    character_mentions: Dict[str, int]  # candidate character name -> mention count
    emotion_words: int                  # total emotion-keyword occurrences

    # --- Scores ---
    overall_score: float                # mean of the per-dimension scores
    dimension_scores: Dict[str, float]  # dimension label -> score
class NovelQualityAssessor:
    """Heuristic quality assessor for Chinese-language novel text.

    Every analysis step is keyword- or regex-based; no tokenizer or NER
    model is involved, so all numbers are rough estimates.
    """

    # Range of common CJK ideographs, for use inside a regex character class.
    _CJK_RANGE = r'\u4e00-\u9fa5'

    # Punctuation sets are spelled with \u escapes so that both the ASCII and
    # the full-width form of each mark are matched, independent of how this
    # source file (or the analysed text) has been width-normalised.
    #   \u3002 = ideographic full stop, \uff01 / \uff1f = full-width ! / ?
    _SENTENCE_END_CHARS = '\u3002!\uff01?\uff1f'
    #   adds comma, colon and semicolon in both widths
    _CLAUSE_END_CHARS = ',\uff0c\u3002!\uff01?\uff1f:\uff1a;\uff1b'
    #   adds \u3001 (ideographic comma), straight and curly quotes,
    #   parentheses in both widths, and the CJK bracket pairs
    #   \u3010\u3011 and \u300a\u300b
    _PUNCT_CHARS = (
        ',\uff0c\u3002!\uff01?\uff1f;\uff1b:\uff1a\u3001'
        '"\u201c\u201d\'\u2018\u2019'
        '()\uff08\uff09\u3010\u3011\u300a\u300b'
    )

    # Words taken as a hint that the scene or time has shifted.
    _SCENE_MARKERS = (
        "突然", "接着", "然后", "后来", "此时", "这时",
        "同时", "与此同时", "第二天", "几天后",
    )

    # Window size (in characters) used by the repetition check.
    _PHRASE_WINDOW = 10

    _CJK_CHAR_RE = re.compile('[' + _CJK_RANGE + ']')
    _CJK_RUN_RE = re.compile('[' + _CJK_RANGE + ']+')
    _DIGIT_RE = re.compile(r'\d')
    _PUNCT_RE = re.compile('[' + re.escape(_PUNCT_CHARS) + ']')
    _SENTENCE_SPLIT_RE = re.compile('[' + re.escape(_SENTENCE_END_CHARS) + ']')
    # A blank line, possibly containing spaces or a carriage return.
    _PARAGRAPH_SPLIT_RE = re.compile(r'\n\s*\n')
    _PHRASE_RE = re.compile('[' + _CJK_RANGE + re.escape(_PUNCT_CHARS) + ']+')
    # 2-3 ideographs directly followed by clause punctuation or a speech/
    # perception verb: a crude stand-in for name recognition.
    _NAME_RE = re.compile(
        '[' + _CJK_RANGE + ']{2,3}(?=['
        + re.escape(_CLAUSE_END_CHARS)
        + ']|说|想|看|听)'
    )
    # Longest markers first, so "与此同时" is consumed as one match instead
    # of additionally being counted as "同时".
    _SCENE_RE = re.compile(
        '|'.join(
            re.escape(marker)
            for marker in sorted(_SCENE_MARKERS, key=len, reverse=True)
        )
    )

    def __init__(self):
        """Set up the keyword lists and dialogue patterns used by the analysis."""
        # Emotion vocabulary, grouped by polarity.
        self.emotion_words = {
            "positive": ["开心", "快乐", "兴奋", "满足", "温暖", "感动", "幸福", "甜蜜", "欣慰", "骄傲"],
            "negative": ["难过", "痛苦", "愤怒", "失望", "恐惧", "焦虑", "绝望", "孤独", "委屈", "后悔"],
            "complex": ["复杂", "矛盾", "纠结", "挣扎", "无奈", "感慨", "思考", "反思", "领悟", "成长"],
        }

        # Action vocabulary (single characters, matched as substrings).
        self.action_words = ["走", "跑", "坐", "站", "看", "听", "说", "想", "做", "拿", "放", "开", "关", "推", "拉"]

        # Descriptive vocabulary.
        self.description_words = ["美丽", "漂亮", "高大", "宽敞", "明亮", "温暖", "寒冷", "炎热", "安静", "嘈杂"]

        # Dialogue markers. Each pattern has exactly one capture group (the
        # spoken text) and the patterns are mutually exclusive, so a single
        # dialogue is never counted by more than one of them.
        self.dialogue_patterns = [
            r'[【「]([^【「】」]+)[】」]',          # CJK bracket-style dialogue
            r'\u201c([^\u201c\u201d]+)\u201d',  # curly double quotes
            r'"([^"]+)"',                       # straight double quotes
            r"'([^']+)'",                       # straight single quotes
        ]

    def _split_sentences(self, text: str) -> List[str]:
        """Return the non-empty, stripped sentences of *text*."""
        pieces = self._SENTENCE_SPLIT_RE.split(text)
        return [piece.strip() for piece in pieces if piece.strip()]

    def _find_dialogues(self, text: str) -> List[str]:
        """Return the inner text of every dialogue found by ``dialogue_patterns``."""
        return [
            spoken
            for pattern in self.dialogue_patterns
            for spoken in re.findall(pattern, text)
        ]

    def assess_novel_quality(self, text: str, title: str = "") -> "QualityMetrics":
        """Run all analyses on *text* and bundle the results.

        Args:
            text: The novel text to assess.
            title: Accepted for interface compatibility; not used in the
                computation.

        Returns:
            A ``QualityMetrics`` record whose ``overall_score`` is the plain
            average of the five dimension scores.
        """
        logger.info("开始质量评估...")

        basic_stats = self._calculate_basic_stats(text)
        content_analysis = self._analyze_content(text)
        structure_analysis = self._analyze_structure(text)
        language_analysis = self._analyze_language_quality(text)

        dimension_scores = {
            "内容丰富度": content_analysis["richness_score"],
            "结构合理性": structure_analysis["structure_score"],
            "语言质量": language_analysis["language_score"],
            "情感表达": content_analysis["emotion_score"],
            "对话质量": content_analysis["dialogue_score"],
        }
        overall_score = sum(dimension_scores.values()) / len(dimension_scores)

        metrics = QualityMetrics(
            total_chars=basic_stats["total_chars"],
            effective_chars=basic_stats["effective_chars"],
            paragraph_count=basic_stats["paragraph_count"],
            sentence_count=basic_stats["sentence_count"],
            dialogue_count=basic_stats["dialogue_count"],
            dialogue_ratio=content_analysis["dialogue_ratio"],
            description_ratio=content_analysis["description_ratio"],
            action_ratio=content_analysis["action_ratio"],
            avg_sentence_length=language_analysis["avg_sentence_length"],
            vocabulary_richness=language_analysis["vocabulary_richness"],
            repetition_rate=language_analysis["repetition_rate"],
            scene_transitions=structure_analysis["scene_transitions"],
            character_mentions=structure_analysis["character_mentions"],
            emotion_words=content_analysis["emotion_words"],
            overall_score=overall_score,
            dimension_scores=dimension_scores,
        )

        logger.info("质量评估完成,综合评分: %.1f", overall_score)
        return metrics

    def _calculate_basic_stats(self, text: str) -> Dict[str, Any]:
        """Count characters, paragraphs, sentences and dialogues in *text*.

        Returns:
            A dict with the keys ``total_chars``, ``effective_chars``,
            ``paragraph_count``, ``sentence_count`` and ``dialogue_count``.
        """
        # Effective characters: ideographs, punctuation and digits. Whitespace
        # and Latin letters are deliberately left out.
        effective_chars = (
            len(self._CJK_CHAR_RE.findall(text))
            + len(self._PUNCT_RE.findall(text))
            + len(self._DIGIT_RE.findall(text))
        )

        # Paragraphs are separated by a blank line; the separator may contain
        # spaces or "\r", which a plain split on "\n\n" would miss.
        paragraphs = [
            chunk for chunk in self._PARAGRAPH_SPLIT_RE.split(text) if chunk.strip()
        ]

        return {
            "total_chars": len(text),
            "effective_chars": effective_chars,
            "paragraph_count": len(paragraphs),
            "sentence_count": len(self._split_sentences(text)),
            "dialogue_count": len(self._find_dialogues(text)),
        }

    def _analyze_content(self, text: str) -> Dict[str, Any]:
        """Estimate dialogue, description, action and emotion content.

        Returns:
            A dict with the keys ``dialogue_ratio``, ``description_ratio``,
            ``action_ratio``, ``emotion_words``, ``richness_score``,
            ``emotion_score`` and ``dialogue_score``.
        """
        # Share of the text that sits inside dialogue markers.
        dialogue_chars = sum(len(spoken) for spoken in self._find_dialogues(text))
        dialogue_ratio = dialogue_chars / len(text) if text else 0

        # Description / action estimates: keyword hits per 100, capped at 0.5.
        description_count = sum(text.count(word) for word in self.description_words)
        description_ratio = min(description_count / 100, 0.5)

        action_count = sum(text.count(word) for word in self.action_words)
        action_ratio = min(action_count / 100, 0.5)

        emotion_words = sum(
            text.count(word)
            for words in self.emotion_words.values()
            for word in words
        )

        # NOTE(review): the weights already sum to 100, so the extra "* 100"
        # makes this score saturate at 100 for almost any input. Kept as-is
        # because changing it would shift every reported score.
        richness_score = min(
            100,
            (dialogue_ratio * 30 + description_ratio * 40 + action_ratio * 30) * 100,
        )
        emotion_score = min(100, emotion_words * 2)      # 2 points per emotion word
        dialogue_score = min(100, dialogue_ratio * 200)  # full marks at 50% dialogue

        return {
            "dialogue_ratio": dialogue_ratio,
            "description_ratio": description_ratio,
            "action_ratio": action_ratio,
            "emotion_words": emotion_words,
            "richness_score": richness_score,
            "emotion_score": emotion_score,
            "dialogue_score": dialogue_score,
        }

    def _analyze_structure(self, text: str) -> Dict[str, Any]:
        """Estimate scene transitions and recurring character names.

        Returns:
            A dict with the keys ``scene_transitions``, ``character_mentions``
            and ``structure_score``.
        """
        # Non-overlapping marker matches: nested markers are counted once.
        scene_transitions = len(self._SCENE_RE.findall(text))

        # Candidate names are kept only when they occur at least three times.
        candidate_counts = Counter(self._NAME_RE.findall(text))
        character_mentions = {
            name: count for name, count in candidate_counts.items() if count >= 3
        }

        structure_score = min(100, scene_transitions * 5 + len(character_mentions) * 10)

        return {
            "scene_transitions": scene_transitions,
            "character_mentions": character_mentions,
            "structure_score": structure_score,
        }

    def _analyze_language_quality(self, text: str) -> Dict[str, Any]:
        """Estimate sentence length, vocabulary richness and repetition.

        Returns:
            A dict with the keys ``avg_sentence_length``,
            ``vocabulary_richness``, ``repetition_rate`` and ``language_score``.
        """
        sentences = self._split_sentences(text)
        avg_sentence_length = (
            sum(len(sentence) for sentence in sentences) / len(sentences)
            if sentences else 0
        )

        # NOTE(review): "words" here are uninterrupted runs of ideographs
        # (roughly clauses), not segmented words, so this value tends
        # towards 1.0 for ordinary prose.
        words = self._CJK_RUN_RE.findall(text)
        vocabulary_richness = len(set(words)) / len(words) if words else 0

        # Slide a fixed-size window over the text and keep the windows made
        # up solely of ideographs/punctuation. "+ 1" includes the final
        # window; fullmatch (unlike match with "$") rejects a window that
        # ends in a newline.
        window = self._PHRASE_WINDOW
        phrases = []
        for start in range(len(text) - window + 1):
            phrase = text[start:start + window]
            if self._PHRASE_RE.fullmatch(phrase):
                phrases.append(phrase)

        # Distinct phrases seen more than once, relative to all windows.
        repeated_phrases = sum(1 for count in Counter(phrases).values() if count > 1)
        repetition_rate = repeated_phrases / len(phrases) if phrases else 0

        # Sentence length scores best at 20 characters and loses 2 points per
        # character of deviation.
        length_score = min(100, max(0, 100 - abs(avg_sentence_length - 20) * 2))
        richness_score = vocabulary_richness * 100
        repetition_score = max(0, 100 - repetition_rate * 200)
        language_score = (length_score + richness_score + repetition_score) / 3

        return {
            "avg_sentence_length": avg_sentence_length,
            "vocabulary_richness": vocabulary_richness,
            "repetition_rate": repetition_rate,
            "language_score": language_score,
        }

    def generate_improvement_suggestions(self, metrics: "QualityMetrics") -> List[str]:
        """Turn low scores and out-of-range indicators into advice strings.

        Args:
            metrics: Result of :meth:`assess_novel_quality`.

        Returns:
            Suggestions in a fixed order: dimension-based ones first, then
            indicator-based ones. Empty when nothing is flagged.
        """
        # One suggestion per dimension whose score is below 70.
        dimension_advice = (
            ("内容丰富度", "建议增加更多的场景描写和人物内心活动,提升内容丰富度"),
            ("结构合理性", "建议优化故事结构,增加场景转换和情节推进"),
            ("语言质量", "建议提升语言表达质量,避免重复用词,丰富词汇"),
            ("情感表达", "建议增强情感表达,多使用情感词汇和内心独白"),
            ("对话质量", "建议增加对话内容,让角色通过对话展现性格"),
        )
        suggestions = [
            advice
            for dimension, advice in dimension_advice
            if metrics.dimension_scores[dimension] < 70
        ]

        # Suggestions driven by individual indicators.
        if metrics.dialogue_ratio < 0.2:
            suggestions.append("对话比例偏低,建议增加角色对话来推进情节")
        if metrics.avg_sentence_length > 30:
            suggestions.append("句子平均长度偏长,建议适当使用短句增加节奏感")
        if metrics.repetition_rate > 0.1:
            suggestions.append("存在较多重复表达,建议使用同义词替换增加表达多样性")
        if len(metrics.character_mentions) < 3:
            suggestions.append("角色数量偏少,建议增加配角丰富故事内容")

        return suggestions

    def export_quality_report(
        self,
        metrics: "QualityMetrics",
        title: str = "",
        output_file: Optional[str] = None,
    ) -> str:
        """Render *metrics* as a Markdown report and optionally save it.

        Args:
            metrics: Result of :meth:`assess_novel_quality`.
            title: Title shown in the report; a placeholder is used when empty.
            output_file: Path to write the report to (UTF-8). Nothing is
                written when omitted.

        Returns:
            The report text. A failure to write the file is logged, not
            raised, and the report is still returned.
        """
        report = f"""
# 小说质量评估报告

## 基本信息
- 标题: {title or "未命名"}
- 总字符数: {metrics.total_chars:,}
- 有效字符数: {metrics.effective_chars:,}
- 段落数: {metrics.paragraph_count}
- 句子数: {metrics.sentence_count}
- 对话数: {metrics.dialogue_count}

## 综合评分: {metrics.overall_score:.1f}/100

## 各维度评分
- 内容丰富度: {metrics.dimension_scores['内容丰富度']:.1f}/100
- 结构合理性: {metrics.dimension_scores['结构合理性']:.1f}/100
- 语言质量: {metrics.dimension_scores['语言质量']:.1f}/100
- 情感表达: {metrics.dimension_scores['情感表达']:.1f}/100
- 对话质量: {metrics.dimension_scores['对话质量']:.1f}/100

## 详细指标
- 对话比例: {metrics.dialogue_ratio:.1%}
- 描写比例: {metrics.description_ratio:.1%}
- 动作比例: {metrics.action_ratio:.1%}
- 平均句长: {metrics.avg_sentence_length:.1f}字
- 词汇丰富度: {metrics.vocabulary_richness:.1%}
- 重复率: {metrics.repetition_rate:.1%}
- 场景转换: {metrics.scene_transitions}次
- 情感词汇: {metrics.emotion_words}个

## 角色提及统计
"""
        report += "".join(
            f"- {character}: {count}次\n"
            for character, count in metrics.character_mentions.items()
        )

        suggestions = self.generate_improvement_suggestions(metrics)
        if suggestions:
            report += "\n## 改进建议\n"
            report += "".join(
                f"{index}. {suggestion}\n"
                for index, suggestion in enumerate(suggestions, 1)
            )

        if output_file:
            # Saving is best-effort: the caller still gets the report text.
            try:
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(report)
            except (OSError, UnicodeError) as exc:
                logger.error("保存质量报告失败: %s", exc)
            else:
                logger.info("质量报告已保存到: %s", output_file)

        return report


def assess_novel_quality(text: str, title: str = "") -> QualityMetrics:
    """Convenience wrapper: assess *text* with a fresh ``NovelQualityAssessor``.

    Args:
        text: The novel text to assess.
        title: Passed through to the assessor unchanged.

    Returns:
        The metrics produced by ``NovelQualityAssessor.assess_novel_quality``.
    """
    return NovelQualityAssessor().assess_novel_quality(text, title)


def generate_quality_report(
    text: str,
    title: str = "",
    output_file: Optional[str] = None,
) -> str:
    """Convenience wrapper: assess *text* and render the report in one call.

    Args:
        text: The novel text to assess.
        title: Title forwarded to both the assessment and the report.
        output_file: Optional path the report is written to; forwarded to
            ``NovelQualityAssessor.export_quality_report``.

    Returns:
        The report text returned by ``export_quality_report``.
    """
    assessor = NovelQualityAssessor()
    metrics = assessor.assess_novel_quality(text, title)
    return assessor.export_quality_report(metrics, title, output_file)


if __name__ == "__main__":
    # Smoke test: assess a short sample chapter and print a few headline numbers.
    sample_text = """
《测试小说》

李明走进办公室,心情复杂。今天是他入职的第一天,既兴奋又紧张。

【你好,我是新来的李明。】他对前台小姐说道。

【欢迎!我是小王,有什么需要帮助的尽管说。】小王热情地回答。

李明感到一阵温暖,看来这里的同事都很友善。他想起了母亲昨天的话:【要好好工作,不要让我们失望。】

突然,一个严肃的声音响起:【你就是新来的?跟我来。】
"""

    metrics = assess_novel_quality(sample_text, "测试小说")
    for line in (
        f"综合评分: {metrics.overall_score:.1f}",
        f"对话比例: {metrics.dialogue_ratio:.1%}",
        f"平均句长: {metrics.avg_sentence_length:.1f}",
    ):
        print(line)