""" 小说质量评估模块 提供多维度的质量评估和改进建议 """ import re import json import logging from typing import Dict, List, Tuple, Any from dataclasses import dataclass from collections import Counter logger = logging.getLogger(__name__) @dataclass class QualityMetrics: """质量指标""" # 基础指标 total_chars: int effective_chars: int paragraph_count: int sentence_count: int dialogue_count: int # 比例指标 dialogue_ratio: float description_ratio: float action_ratio: float # 复杂度指标 avg_sentence_length: float vocabulary_richness: float repetition_rate: float # 结构指标 scene_transitions: int character_mentions: Dict[str, int] emotion_words: int # 质量评分 overall_score: float dimension_scores: Dict[str, float] class NovelQualityAssessor: """小说质量评估器""" def __init__(self): # 情感词汇库 self.emotion_words = { "positive": ["开心", "快乐", "兴奋", "满足", "温暖", "感动", "幸福", "甜蜜", "欣慰", "骄傲"], "negative": ["难过", "痛苦", "愤怒", "失望", "恐惧", "焦虑", "绝望", "孤独", "委屈", "后悔"], "complex": ["复杂", "矛盾", "纠结", "挣扎", "无奈", "感慨", "思考", "反思", "领悟", "成长"] } # 动作词汇 self.action_words = ["走", "跑", "坐", "站", "看", "听", "说", "想", "做", "拿", "放", "开", "关", "推", "拉"] # 描写词汇 self.description_words = ["美丽", "漂亮", "高大", "宽敞", "明亮", "温暖", "寒冷", "炎热", "安静", "嘈杂"] # 对话标记 self.dialogue_patterns = [ r'[【「"]([^【「"]+)[】」"]', # 【】「」""标记的对话 r'"([^"]+)"', # 英文引号 r"'([^']+)'", # 中文引号 ] def assess_novel_quality(self, text: str, title: str = "") -> QualityMetrics: """评估小说质量""" logger.info("开始质量评估...") # 基础统计 basic_stats = self._calculate_basic_stats(text) # 内容分析 content_analysis = self._analyze_content(text) # 结构分析 structure_analysis = self._analyze_structure(text) # 语言质量分析 language_analysis = self._analyze_language_quality(text) # 计算综合评分 dimension_scores = { "内容丰富度": content_analysis["richness_score"], "结构合理性": structure_analysis["structure_score"], "语言质量": language_analysis["language_score"], "情感表达": content_analysis["emotion_score"], "对话质量": content_analysis["dialogue_score"] } overall_score = sum(dimension_scores.values()) / len(dimension_scores) # 构建质量指标对象 metrics = QualityMetrics( total_chars=basic_stats["total_chars"], effective_chars=basic_stats["effective_chars"], paragraph_count=basic_stats["paragraph_count"], sentence_count=basic_stats["sentence_count"], dialogue_count=basic_stats["dialogue_count"], dialogue_ratio=content_analysis["dialogue_ratio"], description_ratio=content_analysis["description_ratio"], action_ratio=content_analysis["action_ratio"], avg_sentence_length=language_analysis["avg_sentence_length"], vocabulary_richness=language_analysis["vocabulary_richness"], repetition_rate=language_analysis["repetition_rate"], scene_transitions=structure_analysis["scene_transitions"], character_mentions=structure_analysis["character_mentions"], emotion_words=content_analysis["emotion_words"], overall_score=overall_score, dimension_scores=dimension_scores ) logger.info(f"质量评估完成,综合评分: {overall_score:.1f}") return metrics def _calculate_basic_stats(self, text: str) -> Dict[str, Any]: """计算基础统计信息""" # 总字符数 total_chars = len(text) # 有效字符数(中文字符+标点+数字) lines = [line.strip() for line in text.split('\n') if line.strip()] combined = ''.join(lines) chinese_chars = re.findall(r'[\u4e00-\u9fa5]', combined) punctuation = re.findall(r'[,。!?;:、""''()【】《》]', combined) numbers = re.findall(r'\d', combined) effective_chars = len(chinese_chars) + len(punctuation) + len(numbers) # 段落数 paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()] paragraph_count = len(paragraphs) # 句子数 sentences = re.split(r'[。!?]', text) sentence_count = len([s for s in sentences if s.strip()]) # 对话数 dialogue_count = 0 for pattern in self.dialogue_patterns: dialogue_count += len(re.findall(pattern, text)) return { "total_chars": total_chars, "effective_chars": effective_chars, "paragraph_count": paragraph_count, "sentence_count": sentence_count, "dialogue_count": dialogue_count } def _analyze_content(self, text: str) -> Dict[str, Any]: """分析内容质量""" # 对话比例 dialogue_chars = 0 for pattern in self.dialogue_patterns: matches = re.findall(pattern, text) dialogue_chars += sum(len(match) for match in matches) dialogue_ratio = dialogue_chars / len(text) if text else 0 # 描写比例(通过描写词汇估算) description_count = 0 for word in self.description_words: description_count += text.count(word) description_ratio = min(description_count / 100, 0.5) # 归一化 # 动作比例 action_count = 0 for word in self.action_words: action_count += text.count(word) action_ratio = min(action_count / 100, 0.5) # 归一化 # 情感词汇统计 emotion_words = 0 for category, words in self.emotion_words.items(): for word in words: emotion_words += text.count(word) # 评分计算 richness_score = min(100, (dialogue_ratio * 30 + description_ratio * 40 + action_ratio * 30) * 100) emotion_score = min(100, emotion_words * 2) # 每个情感词2分 dialogue_score = min(100, dialogue_ratio * 200) # 对话比例评分 return { "dialogue_ratio": dialogue_ratio, "description_ratio": description_ratio, "action_ratio": action_ratio, "emotion_words": emotion_words, "richness_score": richness_score, "emotion_score": emotion_score, "dialogue_score": dialogue_score } def _analyze_structure(self, text: str) -> Dict[str, Any]: """分析结构质量""" # 场景转换(通过时间、地点词汇估算) scene_markers = ["突然", "接着", "然后", "后来", "此时", "这时", "同时", "与此同时", "第二天", "几天后"] scene_transitions = 0 for marker in scene_markers: scene_transitions += text.count(marker) # 角色提及统计(简单的人名识别) # 这里使用简单的启发式方法,实际应用中可以使用NER potential_names = re.findall(r'[\u4e00-\u9fa5]{2,3}(?=[,。!?:;]|说|想|看|听)', text) character_mentions = Counter(potential_names) # 只保留出现频率较高的(可能是角色名) character_mentions = {name: count for name, count in character_mentions.items() if count >= 3} # 结构评分 structure_score = min(100, scene_transitions * 5 + len(character_mentions) * 10) return { "scene_transitions": scene_transitions, "character_mentions": character_mentions, "structure_score": structure_score } def _analyze_language_quality(self, text: str) -> Dict[str, Any]: """分析语言质量""" # 句子平均长度 sentences = re.split(r'[。!?]', text) valid_sentences = [s.strip() for s in sentences if s.strip()] avg_sentence_length = sum(len(s) for s in valid_sentences) / len(valid_sentences) if valid_sentences else 0 # 词汇丰富度(简单估算) words = re.findall(r'[\u4e00-\u9fa5]+', text) unique_words = set(words) vocabulary_richness = len(unique_words) / len(words) if words else 0 # 重复率(检查重复的短语) phrases = [] for i in range(len(text) - 10): phrase = text[i:i+10] if re.match(r'^[\u4e00-\u9fa5,。!?;:、""''()【】《》]+$', phrase): phrases.append(phrase) phrase_counts = Counter(phrases) repeated_phrases = sum(1 for count in phrase_counts.values() if count > 1) repetition_rate = repeated_phrases / len(phrases) if phrases else 0 # 语言质量评分 length_score = min(100, max(0, 100 - abs(avg_sentence_length - 20) * 2)) # 理想句长20字 richness_score = vocabulary_richness * 100 repetition_score = max(0, 100 - repetition_rate * 200) language_score = (length_score + richness_score + repetition_score) / 3 return { "avg_sentence_length": avg_sentence_length, "vocabulary_richness": vocabulary_richness, "repetition_rate": repetition_rate, "language_score": language_score } def generate_improvement_suggestions(self, metrics: QualityMetrics) -> List[str]: """生成改进建议""" suggestions = [] # 基于各维度评分给出建议 if metrics.dimension_scores["内容丰富度"] < 70: suggestions.append("建议增加更多的场景描写和人物内心活动,提升内容丰富度") if metrics.dimension_scores["结构合理性"] < 70: suggestions.append("建议优化故事结构,增加场景转换和情节推进") if metrics.dimension_scores["语言质量"] < 70: suggestions.append("建议提升语言表达质量,避免重复用词,丰富词汇") if metrics.dimension_scores["情感表达"] < 70: suggestions.append("建议增强情感表达,多使用情感词汇和内心独白") if metrics.dimension_scores["对话质量"] < 70: suggestions.append("建议增加对话内容,让角色通过对话展现性格") # 基于具体指标给出建议 if metrics.dialogue_ratio < 0.2: suggestions.append("对话比例偏低,建议增加角色对话来推进情节") if metrics.avg_sentence_length > 30: suggestions.append("句子平均长度偏长,建议适当使用短句增加节奏感") if metrics.repetition_rate > 0.1: suggestions.append("存在较多重复表达,建议使用同义词替换增加表达多样性") if len(metrics.character_mentions) < 3: suggestions.append("角色数量偏少,建议增加配角丰富故事内容") return suggestions def export_quality_report(self, metrics: QualityMetrics, title: str = "", output_file: str = None) -> str: """导出质量报告""" report = f""" # 小说质量评估报告 ## 基本信息 - 标题: {title or "未命名"} - 总字符数: {metrics.total_chars:,} - 有效字符数: {metrics.effective_chars:,} - 段落数: {metrics.paragraph_count} - 句子数: {metrics.sentence_count} - 对话数: {metrics.dialogue_count} ## 综合评分: {metrics.overall_score:.1f}/100 ## 各维度评分 - 内容丰富度: {metrics.dimension_scores['内容丰富度']:.1f}/100 - 结构合理性: {metrics.dimension_scores['结构合理性']:.1f}/100 - 语言质量: {metrics.dimension_scores['语言质量']:.1f}/100 - 情感表达: {metrics.dimension_scores['情感表达']:.1f}/100 - 对话质量: {metrics.dimension_scores['对话质量']:.1f}/100 ## 详细指标 - 对话比例: {metrics.dialogue_ratio:.1%} - 描写比例: {metrics.description_ratio:.1%} - 动作比例: {metrics.action_ratio:.1%} - 平均句长: {metrics.avg_sentence_length:.1f}字 - 词汇丰富度: {metrics.vocabulary_richness:.1%} - 重复率: {metrics.repetition_rate:.1%} - 场景转换: {metrics.scene_transitions}次 - 情感词汇: {metrics.emotion_words}个 ## 角色提及统计 """ for character, count in metrics.character_mentions.items(): report += f"- {character}: {count}次\n" # 添加改进建议 suggestions = self.generate_improvement_suggestions(metrics) if suggestions: report += "\n## 改进建议\n" for i, suggestion in enumerate(suggestions, 1): report += f"{i}. {suggestion}\n" # 保存到文件 if output_file: try: with open(output_file, 'w', encoding='utf-8') as f: f.write(report) logger.info(f"质量报告已保存到: {output_file}") except Exception as e: logger.error(f"保存质量报告失败: {str(e)}") return report def assess_novel_quality(text: str, title: str = "") -> QualityMetrics: """便捷函数:评估小说质量""" assessor = NovelQualityAssessor() return assessor.assess_novel_quality(text, title) def generate_quality_report(text: str, title: str = "", output_file: str = None) -> str: """便捷函数:生成质量报告""" assessor = NovelQualityAssessor() metrics = assessor.assess_novel_quality(text, title) return assessor.export_quality_report(metrics, title, output_file) if __name__ == "__main__": # 测试代码 sample_text = """ 《测试小说》 李明走进办公室,心情复杂。今天是他入职的第一天,既兴奋又紧张。 【你好,我是新来的李明。】他对前台小姐说道。 【欢迎!我是小王,有什么需要帮助的尽管说。】小王热情地回答。 李明感到一阵温暖,看来这里的同事都很友善。他想起了母亲昨天的话:【要好好工作,不要让我们失望。】 突然,一个严肃的声音响起:【你就是新来的?跟我来。】 """ metrics = assess_novel_quality(sample_text, "测试小说") print(f"综合评分: {metrics.overall_score:.1f}") print(f"对话比例: {metrics.dialogue_ratio:.1%}") print(f"平均句长: {metrics.avg_sentence_length:.1f}")