ScorePP-用标准C++实现的自动分词评测程序
中文分词是自然语言处理的基础性关键问题,近一年来一直在进行着分词方面的研究。一开始用的是SIGHAN Bakeoff 提供的用Perl脚本编写的分词打分程序Score。为了把用C++写的分词程序和评测程序无缝地结合在一起,同时也为了自动地分析分词中的错误原因,参考Score改写了在C++下的评测程序,我自己称它为ScorePP。为了分词很长时间都没有休息了。中午走出自动化所的食堂,不知道怎么的,突然想起来看了日历,发现生日就在今天。回想着半年的时间,天天都在跟文字打交道,天天为了提高分词的效果,绞尽脑汁。过得把时间都忘了。今年的生日又是跟往常一样,自己一个人过了。为了纪念这个重要的时间,把自己改写的分词评测程序,发布出来。如果对大家有所帮助,也算是很欣慰的一件事。也算是献给自己的生日礼物。
本分词评测程序有以下几点要注意的:
1 提供分词结果检查,即切分的数据总量要和标准的相同,不能多切,也不能少切。
2 有完善的评测指标接口:准确度、召回率、F值、未登录词比例、未登录词召回率、登录词召回率。
3 分词切分标志可以是空格,也可以是斜杠。
以下为程序源代码:
Score.h
/********************************************************************
* Copyright (C) 2012 Li Yachao
* Contact: liyc7711@gmail.com or harry_lyc@foxmail.com
*
* Permission to use, copy, modify, and distribute this software for
* any non-commercial purpose is hereby granted without fee, provided
* that the above copyright notice appear in all copies and that both
* that copyright notice.
* It is provided "as is" without express or implied warranty.
*
* Version: 0.1
* Last update: 2012-4-13
*********************************************************************/
/*********************************************************************
用于分析分词结果
*********************************************************************/
#ifndef SCORE_H
#define SCORE_H
#include <iomanip>
#include <iostream>
#include <fstream>
#include <vector>
#include <set>
#include <string>
namespace MyUtility
{
/* Token counts recorded for one gold/test line pair. */
struct ScoreItem
{
    int GoldTotal;   /* number of tokens in the gold-standard line */
    int TestTotal;   /* number of tokens in the test line */
    int TestCorrect; /* number of test tokens counted as correct */
};

/*
 * Scores a word-segmentation result against a gold standard, line by line,
 * and exposes precision / recall / F-measure plus out-of-vocabulary (OOV)
 * and in-vocabulary (IV) statistics.
 */
class Score
{
public:
    /*
     * gold_file   : path of the gold-standard segmentation.
     * test_file   : path of the segmentation to be evaluated.
     * dic_file    : path of the dictionary used to tell IV from OOV tokens.
     * report_file : path of the report to write; empty means no report.
     * (The original note states that an empty report or dictionary path
     * means that file is not used.)
     */
    Score(const std::string& gold_file,const std::string& test_file,
          const std::string& dic_file ,const std::string& report_file="");
    Score();
    ~Score();
    /* Resets the counters and drops all loaded lines, scores and dictionary entries. */
    void Clear();
    double GetRecall();
    double GetPrecise();
    double GetFMeasure();
    int GetTrueWords();
    int GetTestWords();
    double GetTestOOVRate();   /* share of gold tokens that are out of vocabulary */
    double GetOOVRecallRate(); /* recall on out-of-vocabulary tokens */
    double GetIVRecallRate();  /* recall on in-vocabulary tokens */
private:
    std::ofstream fout;          /* report output stream */
    //std::ofstream fout1;       /* (disabled) secondary output stream */
    std::string reportFile;      /* report file path */
    std::string goldFile;        /* gold-standard file path */
    std::string testFile;        /* test file path */
    std::string dictionaryFile;  /* dictionary file path */
    int totalOOVTokens ;         /* number of OOV gold tokens */
    int totalOOVCorrectTokens;   /* number of OOV gold tokens segmented correctly */
    int totalIVCorrectTokens;    /* number of IV gold tokens segmented correctly */
    std::vector<std::string>goldLines;       /* lines of the gold-standard file */
    std::vector<std::string>testLines;       /* lines of the test file */
    std::vector<struct ScoreItem> listScore; /* per-line scoring results */
    std::set<std::string> dictionaryList;    /* dictionary entries */
    /*************************************************/
    bool IsPrefix(const std::string &src, const std::string &prefix);
    bool Postfix(const std::string &src, const std::string &postfix);
    bool Init();
    bool InitDict(const std::string& filePath); /* loads the dictionary */
    bool IsEntryExist(const char * entry);
    bool Parse(const std::vector<std::string>& gold_tokens,const std::vector<std::string>& test_tokens,struct ScoreItem& score);
    bool FileReader(const std::string& path,std::vector<std::string>& lines);
    /* Fixed: the parameter type was fused into "constint" and did not compile. */
    void SplitByTokens(std::vector<std::string> &vecstr, const std::string &str, const std::string tokens[],const int tokensnumber, bool withtoken);
};
}
#endif
Score.cpp
#include "Score.h"
namespace MyUtility
{
/*
 * Default constructor: creates an empty scorer with no files attached.
 * The counters are zeroed so that the getters do not read uninitialised
 * values on an object that was never given input files.
 */
Score::Score()
    : totalOOVTokens(0), totalOOVCorrectTokens(0), totalIVCorrectTokens(0)
{
}
/* Releases the loaded data and closes the report stream if it is open. */
Score::~Score()
{
    dictionaryList.clear();
    Clear();
    /* The original condition was inverted: it only tried to close a stream
       that was NOT open. */
    if(fout.is_open())
    {
        fout.close();
    }
}
/*
*************************************************
Purpose : evaluate the quality of a segmentation result.
Params  : gold_file   - gold-standard segmentation;
          test_file   - segmentation produced by the user;
          dic_file    - dictionary file;
          report_file - report file.
Returns :
-------------------------------------------------
Notes   : stores the paths, resets all counters and containers, then runs
          Init(); a message is printed to std::cout when Init() fails.
-------------------------------------------------
Author  : Li Yachao
Date    : 2012-4-13
*************************************************
*/
Score::Score(const std::string& gold_file,const std::string& test_file,
const std::string& dic_file,const std::string& report_file)
    : reportFile(report_file),
      goldFile(gold_file),
      testFile(test_file),
      dictionaryFile(dic_file)
{
    Clear();
    const bool initialised = Init();
    if(initialised)
    {
        return;
    }
    std::cout<<"Use score "<<std::endl;
}
/*
 * Returns the overall precision: correct test tokens divided by all test
 * tokens, summed over every scored line. Returns -1 when no test tokens
 * were recorded.
 */
double Score::GetPrecise()
{
    double testTotal = 0;
    double correctTotal = 0;
    /* The element subscript was missing in the original text. */
    for(size_t i=0;i<listScore.size();++i)
    {
        testTotal += listScore[i].TestTotal;
        correctTotal += listScore[i].TestCorrect;
    }
    if(testTotal == 0)
    {
        return -1;
    }
    return correctTotal/testTotal;
}
/*
 * Returns the overall recall: correct test tokens divided by all gold
 * tokens, summed over every scored line. Returns -1 when no gold tokens
 * were recorded.
 */
double Score::GetRecall()
{
    double goldTotal = 0;
    double correctTotal = 0;
    /* The element subscript was missing in the original text. */
    for(size_t i=0;i<listScore.size();++i)
    {
        goldTotal += listScore[i].GoldTotal;
        correctTotal += listScore[i].TestCorrect;
    }
    if(goldTotal == 0)
    {
        return -1;
    }
    return correctTotal/goldTotal;
}
/*取得分词的F值*/
double Score::GetFMeasure()
{
double a = GetPrecise();
double b = GetRecall();
if((a <= 0)&& (b <= 0))
{
return -1;
}
return 2*a*b/(a+b);
}
/* Returns the total number of gold-standard tokens over all scored lines. */
int Score::GetTrueWords()
{
    int goldWords = 0;
    /* The element subscript was missing in the original text. */
    for(size_t i=0;i<listScore.size();++i)
    {
        goldWords += listScore[i].GoldTotal;
    }
    return goldWords;
}
/* Returns the total number of test (user) tokens over all scored lines. */
int Score::GetTestWords()
{
    int testWords = 0;
    /* The element subscript was missing in the original text. */
    for(size_t i=0;i<listScore.size();++i)
    {
        testWords += listScore[i].TestTotal;
    }
    return testWords;
}
/*
 * Returns the share of out-of-vocabulary tokens among all gold tokens,
 * or -1 when there are no gold tokens.
 */
double Score::GetTestOOVRate()
{
    const double goldWords = GetTrueWords();
    if(goldWords == 0)
    {
        return -1;
    }
    return totalOOVTokens/goldWords;
}
/*
 * Returns the recall on out-of-vocabulary tokens: correctly segmented OOV
 * tokens divided by all OOV tokens. Returns 0 when no OOV token was counted.
 */
double Score:: GetOOVRecallRate()
{
    if(totalOOVTokens > 0)
    {
        const double correct = totalOOVCorrectTokens;
        const double total = totalOOVTokens;
        return correct/total;
    }
    return 0;
}
/*
 * Returns the recall on in-vocabulary tokens: correctly segmented IV tokens
 * divided by (gold tokens - OOV tokens). Returns 0 when that denominator is 0.
 */
double Score::GetIVRecallRate()
{
    const int inVocabularyTokens = GetTrueWords() - totalOOVTokens;
    if(inVocabularyTokens == 0)
    {
        return 0;
    }
    const double denominator = inVocabularyTokens;
    return static_cast<double>(totalIVCorrectTokens)/denominator;
}
/*
 * Resets the OOV/IV counters to zero and empties the loaded lines, the
 * per-line scores and the dictionary. File paths and the report stream
 * are left untouched.
 */
void Score::Clear()
{
    /* containers */
    dictionaryList.clear();
    listScore.clear();
    testLines.clear();
    goldLines.clear();
    /* counters */
    totalIVCorrectTokens = 0;
    totalOOVCorrectTokens = 0;
    totalOOVTokens = 0;
}
bool Score::Init()
{
if(!reportFile.empty())
{
fout.open(reportFile.c_str());
if(!fout.is_open())
{
return false;
}
}
if(dictionaryFile.empty())
{
return false;
}
else
{
if(!InitDict(dictionaryFile))
{
std::cout<<"Dict Empty!";
}
}
if(!FileReader(goldFile,goldLines))
{
return false;
}
if(!FileReader(testFile,testLines))
{
return false;
}
if(goldLines.size() != testLines.size())
{
std::cerr <<"File lines size fault.";
return false;
}
//fout1.open("error.txt");
int line_size = goldLines.size();
std::vector<std::string>gold_tokens;
std::vector<std::string>test_tokens;
std::string seg_flags[]={" ","/"};/*分词的切分标志可以是空格,也可以是斜杠*/
for(int i=0;i<line_size;i++)
{
SplitByTokens(gold_tokens,goldLines,seg_flags,2,false);
SplitByTokens(test_tokens,testLines,seg_flags,2,false);
struct ScoreItem score;
if(!Parse(gold_tokens,test_tokens, score))
{
std::cerr<<"Parse error at line "<<i+1<<std::endl;
}
gold_tokens.clear();
test_tokens.clear();
/*********Score评分************/
listScore.push_back(score);
/*S****************************/
}
if(!reportFile.empty())
{
fout<<"/**************************************************/"<<std::endl ;
fout<<"True Word Count :"<<GetTrueWords()<<std::endl ;
fout<<"Test Word Count :"<<GetTestWords()<<std::endl ;
fout<<std::fixed<<std::setprecision(4)<<"Precision:"<<GetPrecise()<<std::endl<<"Recall:"<<GetRecall()<<std::endl<<"F Measure:"<<GetFMeasure()<<std::endl ;
fout<<"OOV Rate : "<<GetTestOOVRate()<<std::endl ;
fout<<"OOV Recall Rate : "<<GetOOVRecallRate()<<std::endl ;
fout<<"IV Recall Rate : "<<GetIVRecallRate()<<std::endl ;
}
}
bool Score::Parse(const std::vector<std::string>& gold_tokens,const std::vector<std::string>& test_tokens, struct ScoreItem& score)
{
if(gold_tokens.size() == 0 || test_tokens.size() == 0)
{
return false;
}
//totalGoldTokens += gold_tokens.size();
/*****未登录词**************************/
for(int i=0;i<gold_tokens.size();i++)
{
if(!IsEntryExist(gold_tokens.c_str()))
{
totalOOVTokens ++;
}
}
/******************************/
std::vector<bool>gold_val;
/*****************************/
score.GoldTotal = gold_tokens.size();
score.TestCorrect = test_tokens.size();
score.TestTotal = test_tokens.size();
/*****************************/
size_t length_gold = 0;
size_t length_test = 0;
size_t gold_last_success = 0;
size_t gold_cur_pos = 0;
size_t gold_cur_prior = 0;
size_t test_last_success = 0;
size_t test_cur_pos = 0;
size_t test_cur_prior = 0;
std::string buffer_gold = gold_tokens;
std::string buffer_test = test_tokens;
std::string str_out ="";
for(int i=0;i<gold_tokens.size();i++)
{
length_gold += gold_tokens.length();
}
for(int i=0;i<test_tokens.size();i++)
{
length_test += test_tokens.length();
}
if(length_gold != length_test)
{
return false;/*切分有误,丢失了数据*/
}
if(!reportFile.empty())
{
fout << gold_tokens;
fout <<"\t";
}
if(gold_tokens!= test_tokens)
{
score.TestCorrect --;
gold_val.push_back(false);
if(!reportFile.empty())
{
fout <<"|"<<test_tokens;
}
}
else
{
if(!reportFile.empty())
{
fout <<test_tokens;
}
//if((NULL != dic) && (dic->IsEntryExist(test_tokens.c_str())))
//{
// //totalOOVCorrectTokens ++;
//}
gold_val.push_back(true);
}
if(!reportFile.empty())
{
fout<<std::endl ;
}
while(true)
{
if((gold_last_success < gold_cur_pos) && (gold_cur_prior < gold_cur_pos ))
{
buffer_gold += gold_tokens;
gold_cur_prior++;
if(!reportFile.empty())
{
fout <<gold_tokens<<"\t<"<<std::endl;
}
gold_val.push_back(false);
}
if((test_last_success < test_cur_pos) && (test_cur_prior < test_cur_pos ))
{
buffer_test += test_tokens;
test_cur_prior++;
if(!reportFile.empty())
{
fout <<"\t>"<< test_tokens<<std::endl;
}
}
if((buffer_gold.length () == buffer_test.length()) && (buffer_gold == buffer_test))
{
/************************************/
gold_cur_pos ++;
gold_last_success = gold_cur_pos;
gold_cur_prior = gold_cur_pos;
/************************************/
test_cur_pos ++;
test_last_success = test_cur_pos;
test_cur_prior = test_cur_pos;
/************************************/
if(gold_cur_pos < gold_tokens.size())
{
buffer_gold = gold_tokens ;
if(!reportFile.empty())
{
fout << gold_tokens;
fout <<"\t";
}
}
if(test_cur_pos < test_tokens.size())
{
buffer_test = test_tokens ;
if(buffer_gold != buffer_test)
{
score.TestCorrect --;
}
else
{
//if((NULL != dic) && (dic->IsEntryExist(test_tokens.c_str())))
//{
// //totalOOVCorrectTokens ++;
//}
}
if(!reportFile.empty())
{
if(gold_tokens != test_tokens)
{
fout <<"|"<<test_tokens;
gold_val.push_back(false);
}
else
{
fout <<test_tokens;
gold_val.push_back(true);
}
fout <<std::endl ;
}
}
if((gold_cur_pos >= gold_tokens.size()) && (test_cur_pos >= test_tokens.size()))
{
break;
}
continue;
}
else if(buffer_gold.length () < buffer_test.length())
{
/********************************************/
gold_cur_prior = gold_cur_pos;
gold_cur_pos ++;
}
else if(buffer_gold.length () > buffer_test.length())
{
/********************************************/
//fout<<test_tokens<<std::endl;
score.TestCorrect --;
test_cur_prior = test_cur_pos;
test_cur_pos ++;
}
}
if(fout.is_open())
{
double precise = (double)score.TestCorrect/ (double)score.TestTotal;
double recall = (double)score.TestCorrect / (double)score.GoldTotal;
double f = 0;
if((precise!=0) || (recall != 0))
{
f = 2*precise*recall/(recall+precise);
}
fout<<std::fixed<<std::setprecision(4)<<"Precision:"<<precise<<" Recall:"<<recall<<" F Measure:"<<f<<std::endl ;
}
/**************************************************/
if(gold_tokens.size() != gold_val.size())
{
return false;
}
for(int i=0;i<gold_tokens.size();i++)
{
if(gold_val)
{
if(IsEntryExist(gold_tokens.c_str()))
{
totalIVCorrectTokens++;
}
else
{
totalOOVCorrectTokens++;
}
}
else
{
//fout1<<gold_tokens<<std::endl;
}
}
//fout1<<std::endl ;
//fout1.clear();
return true;
}
/*
 * Returns true when `entry` is present in the loaded dictionary.
 * A NULL pointer is never found.
 *
 * The original carried a commented-out fallback meant for Tibetan entries
 * with a trailing syllable dot ("\xe0\xbc\x8b"); with that code disabled the
 * fallback only repeated the same lookup, so a single lookup is equivalent.
 */
bool Score::IsEntryExist(const char * entry)
{
    if(NULL == entry)
    {
        return false;
    }
    const std::string key(entry);
    return dictionaryList.find(key) != dictionaryList.end();
}
/*
 * Loads the dictionary: every line of `filePath` becomes one entry of
 * dictionaryList. Returns false when the file cannot be read, true otherwise.
 */
bool Score::InitDict(const std::string& filePath)
{
    std::vector<std::string> lines;
    if(!FileReader(filePath,lines))
    {
        //std::cerr << "Open Dict [" << filePath << "] error!" << std::endl;
        return false;
    }
    /* The original inserted the whole vector (its subscript was missing);
       a range insert adds each line as one entry. */
    dictionaryList.insert(lines.begin(), lines.end());
    return true;
}
/*
 * Reads the text file at `path` into `lines`, one element per line.
 * A UTF-8 byte-order mark at the start of the first line is removed.
 * A first line starting with the bytes FF FE is rejected with a message
 * asking for a UTF8 or ANSI file.
 * Returns false when the file cannot be opened or is rejected.
 * `lines` is cleared once the file has been opened successfully.
 */
bool Score::FileReader(const std::string& path,std::vector<std::string>& lines)
{
    std::ifstream input(path.c_str());
    if(!input.is_open())
    {
        std::cerr << "Open [" << path << "] error!" << std::endl;
        return false;
    }
    lines.clear();
    const std::string utf8Mark = "\xef\xbb\xbf";
    const std::string utf16Mark = "\xff\xfe";
    bool onFirstLine = true;
    std::string current;
    while(std::getline(input, current))
    {
        if(onFirstLine)
        {
            onFirstLine = false;
            if(IsPrefix(current,utf8Mark))
            {
                /* drop the three mark bytes */
                current.erase(0, 3);
            }
            else if(IsPrefix(current,utf16Mark))
            {
                std::cerr <<"["<<path<<"], File type error,need UTF8 or ANSI file."<<std::endl;
                return false;
            }
        }
        lines.push_back(current);
    }
    input.close();
    return true;
}
/*
 * Returns true when `src` ends with `postfix`.
 * An empty `postfix` matches every string.
 *
 * The original compared the two whole strings inside its loop because the
 * character subscripts were missing; std::string::compare on the tail of
 * `src` does the intended character-wise comparison.
 */
bool Score::Postfix(const std::string &src, const std::string &postfix)
{
    const std::string::size_type postfixLength = postfix.size();
    const std::string::size_type srcLength = src.size();
    if(postfixLength > srcLength)
    {
        return false;
    }
    return src.compare(srcLength - postfixLength, postfixLength, postfix) == 0;
}
/*
 * Returns true when `src` starts with `prefix`.
 * An empty `prefix` matches every string.
 *
 * The original compared the two whole strings inside its loop because the
 * character subscripts were missing; std::string::compare on the head of
 * `src` does the intended character-wise comparison.
 */
bool Score::IsPrefix(const std::string &src, const std::string &prefix)
{
    const std::string::size_type prefixLength = prefix.size();
    if(prefixLength == 0)
    {
        return true ;
    }
    if(prefixLength > src.size())
    {
        return false;
    }
    return src.compare(0, prefixLength, prefix) == 0;
}
/*
 * Splits `str` on any of the `tokensnumber` delimiter strings in `tokens`
 * and stores the non-empty pieces in `vecstr`, which is cleared first.
 *
 *   vecstr       : output pieces, in order of appearance.
 *   str          : text to split.
 *   tokens       : array of delimiter strings; empty delimiters are ignored.
 *   tokensnumber : number of elements in `tokens`.
 *   withtoken    : when true, each piece keeps the delimiter that ended it.
 *
 * At each step the delimiter occurring earliest at or after the current
 * position is used. A delimiter found exactly at the current position yields
 * no piece, so runs of delimiters (e.g. several spaces) produce no empty
 * entries.
 *
 * Fixes over the original text: the `const int` parameter type was fused
 * into "constint", the `tokens[i]` subscripts were missing, and the result
 * of std::string::find was squeezed into an int instead of being compared
 * with npos.
 */
void Score::SplitByTokens(std::vector<std::string> &vecstr, const std::string &str, const std::string tokens[],const int tokensnumber, bool withtoken)
{
    vecstr.clear();
    if(str.empty() || tokensnumber <= 0)
    {
        return;
    }
    const std::string::size_type textLength = str.length();
    std::string::size_type start = 0;
    while(start < textLength)
    {
        /* Nearest delimiter at or after `start`; default: end of text. */
        std::string::size_type offset = textLength;
        std::string::size_type subLength = 0;
        for(int i=0;i<tokensnumber;++i)
        {
            if(tokens[i].empty())
            {
                continue;
            }
            const std::string::size_type curr = str.find(tokens[i],start);
            if((curr != std::string::npos) && (curr < offset))
            {
                offset = curr;
                subLength = tokens[i].length();
            }
        }
        if(start < offset)
        {
            std::string::size_type len = offset - start;
            if(withtoken)
            {
                len += subLength;
            }
            vecstr.push_back(str.substr(start,len));
        }
        /* Skip past the delimiter, or reach the end when none was found. */
        start = offset + subLength;
    }
}
}
作者:Harry_lyc 发表于2012-4-13 17:05:58 原文链接
页:
[1]