lucene怎么使用nlpir进行分词

2025-05-17 21:30:43
推荐回答(2个)
回答(1):

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Runtime.InteropServices;
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.DChinese
{
[StructLayout(LayoutKind.Explicit)]
public struct result_t
{
[FieldOffset(0)]
public int start;
[FieldOffset(4)]
public int length;
[FieldOffset(8)]
public int sPos1;
[FieldOffset(12)]
public int sPos2;
[FieldOffset(16)]
public int sPos3;
[FieldOffset(20)]
public int sPos4;
[FieldOffset(24)]
public int sPos5;
[FieldOffset(28)]
public int sPos

回答(2):

  词法分析是lucene的一个模块,lucene自带的中文分词器(analyzer)一般效果不是很理想。现在项目中用的分词工具是北理工的NLPIR,但是NLPIR没有一个现成的lucene分词器(analyzer)实现类。这里就需要自己来写一个比较简短的基于NLPIR的analyzer实现类。
  不同的Analyzer就是组合不同的Tokenizer和TokenFilter得到最后的TokenStream。以StandardAnalyzer为例,阅读其源码可以看到:TokenFilter使用的是装饰者模式(每个Filter包装上一级的TokenStream),且Tokenizer和TokenFilter都继承自TokenStream类。
  import com.sun.jna.Library;
  import com.sun.jna.Native;
  
  /**
   * Thin JNA wrapper around the native NLPIR (ICTCLAS) word-segmentation library.
   * Loads the shared library from "code/NLPIR" (NLPIR.dll / libNLPIR.so).
   */
  public class NLPTool {

      /** JNA mapping of the native NLPIR C API. */
      public interface CLibrary extends Library {
          // Eagerly load the native library once for the whole process.
          CLibrary Instance = (CLibrary) Native.loadLibrary("code/NLPIR",
                  CLibrary.class);

          int NLPIR_Init(String sDataPath, int encoding, String sLicenceCode);
          String NLPIR_ParagraphProcess(String sSrc, int bPOSTagged);
          String NLPIR_GetKeyWords(String sLine, int nMaxKeyLimit,
                  boolean bWeightOut);
          String NLPIR_GetFileKeyWords(String sLine, int nMaxKeyLimit,
                  boolean bWeightOut);
          int NLPIR_AddUserWord(String sWord);// add by qp 2008.11.10
          int NLPIR_DelUsrWord(String sWord);// add by qp 2008.11.10
          String NLPIR_GetLastErrorMsg();
          void NLPIR_Exit();
      }

      /**
       * Segments {@code sInput} with NLPIR.
       *
       * @param sInput text to segment
       * @param type   second argument of NLPIR_ParagraphProcess; non-zero
       *               requests POS tags in the output
       * @return the segmented text (words separated by spaces), or "" when
       *         initialization or processing fails
       */
      public static String SegAndPos(String sInput, int type) {
          String argu = ".";        // data path: the NLPIR Data/ dir must live under it
          String nativeBytes = "";
          int charset_type = 1;     // NLPIR encoding constant — presumably UTF-8; confirm against NLPIR docs
          // BUG FIX: NLPIR_Init returns 0 on failure. The original ignored the
          // result and called ParagraphProcess on an uninitialized library.
          if (CLibrary.Instance.NLPIR_Init(argu, charset_type, "0") == 0) {
              System.err.println("NLPIR init failed: "
                      + CLibrary.Instance.NLPIR_GetLastErrorMsg());
              return nativeBytes;
          }
          try {
              // CLibrary.Instance.NLPIR_AddUserWord("奇虎360 nt");
              nativeBytes = CLibrary.Instance.NLPIR_ParagraphProcess(sInput, type);
          } catch (Exception ex) {
              ex.printStackTrace();
          } finally {
              // BUG FIX: always release native resources, even when
              // ParagraphProcess throws (original called Exit inside try).
              CLibrary.Instance.NLPIR_Exit();
          }
          return nativeBytes;
      }
  }

  [java] view plain copy
  import java.io.Reader;
  import org.apache.lucene.analysis.util.CharTokenizer;
  import org.apache.lucene.util.Version;
  
  /*
  *@author:xyd
  *@department:CasCeep
  */
  /**
   * Whitespace tokenizer for NLPIR output: NLPIR separates segmented words
   * with spaces, so one token is any maximal run of non-whitespace chars.
   */
  public class MyChineseTokenizer extends CharTokenizer {

      public MyChineseTokenizer(Reader in) {
          super(Version.LUCENE_47, in);
      }

      public MyChineseTokenizer(AttributeFactory factory, Reader in) {
          // BUG FIX: the original ignored the supplied factory and delegated
          // to super(Version, Reader); forward it so a caller-provided
          // AttributeFactory actually takes effect.
          super(Version.LUCENE_47, factory, in);
      }

      /**
       * Accept every non-whitespace character as part of a token.
       *
       * @see org.apache.lucene.analysis.util.CharTokenizer#isTokenChar(int)
       */
      @Override
      protected boolean isTokenChar(int c) {
          return !Character.isWhitespace(c);
      }
  }

  [java] view plain copy
  import java.io.BufferedReader;
  import java.io.IOException;
  import java.io.Reader;
  import java.io.StringReader;
  import lucene.NLPTool;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.Tokenizer;
  import org.apache.lucene.analysis.core.LowerCaseFilter;
  import org.apache.lucene.analysis.core.StopFilter;
  import org.apache.lucene.analysis.standard.StandardFilter;
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
  import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
  import org.apache.lucene.analysis.util.CharArraySet;
  import org.apache.lucene.util.Version;
  
  /*
  *@author:xyd
  *@department:CasCeep
  */
  public final class MyChineseAnalyzer extends Analyzer {
  private CharArraySet stopWords;
  /**
  * An array containing some common English words that are not usually useful
  * for searching.
  */
  public static final String[] CHINESE_ENGLISH_STOP_WORDS = {"我", "的" };
  
  /**
   * Builds an analyzer whose StopFilter removes the words listed in
   * {@link #CHINESE_ENGLISH_STOP_WORDS}.
   */
  public MyChineseAnalyzer() {
      // makeStopSet builds the case-sensitive CharArraySet the StopFilter
      // consults during analysis.
      this.stopWords = StopFilter.makeStopSet(Version.LUCENE_47,
              CHINESE_ENGLISH_STOP_WORDS);
  }
  
  /**
   * Builds the token-stream pipeline: NLPIR pre-segments the whole input
   * (inserting spaces between words), a whitespace tokenizer splits it, then
   * StandardFilter -> LowerCaseFilter -> StopFilter post-process the tokens.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      // BUG FIX: the original used br.readLine(), which silently dropped
      // everything after the first line and passed null to NLPTool on empty
      // input. Read the entire reader instead.
      StringBuilder text = new StringBuilder();
      try {
          BufferedReader br = new BufferedReader(reader);
          String line;
          while ((line = br.readLine()) != null) {
              text.append(line).append('\n');
          }
      } catch (IOException e) {
          e.printStackTrace();
      }
      // NLPIR inserts a space between segmented words, so tokenizing its
      // output on whitespace yields one token per word.
      String segmented = NLPTool.SegAndPos(text.toString(), 0);
      // BUG FIX: build the pipeline outside the try block so tokenizer can
      // never be null when handed to TokenStreamComponents (the original
      // NPE'd there after an IOException).
      Tokenizer tokenizer = new MyChineseTokenizer(new StringReader(segmented));
      TokenStream tokFilter = new StandardFilter(Version.LUCENE_47, tokenizer);
      tokFilter = new LowerCaseFilter(Version.LUCENE_47, tokFilter);
      // Drop the configured stop words.
      tokFilter = new StopFilter(Version.LUCENE_47, tokFilter, stopWords);
      return new TokenStreamComponents(tokenizer, tokFilter);
  }
  
  /**
   * Demo: analyzes a short Chinese sentence and prints, per token,
   * startOffset:endOffset, term text, type, position length and
   * position increment, tab-separated.
   */
  public static void main(String[] args) throws IOException {
      String string = "我的老师在中国科学院工作";
      Analyzer analyzer = new MyChineseAnalyzer();
      TokenStream tokenStream = analyzer.tokenStream("field",
              new StringReader(string));
      // BUG FIX: register attributes once, before reset(). Calling
      // addAttribute() inside the loop (as the original did) only worked
      // because repeated calls return the same instance; the TokenStream
      // contract requires attributes to be added before consuming.
      CharTermAttribute attribute = tokenStream.addAttribute(CharTermAttribute.class);    // term text
      OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);        // char offsets
      PositionIncrementAttribute positionAttr =
              tokenStream.addAttribute(PositionIncrementAttribute.class);                 // position increment
      PositionLengthAttribute posL = tokenStream.addAttribute(PositionLengthAttribute.class); // position length
      TypeAttribute typeAttr = tokenStream.addAttribute(TypeAttribute.class);             // token type
      try {
          tokenStream.reset();
          while (tokenStream.incrementToken()) {
              System.out.println(offsetAtt.startOffset() + ":"
                      + offsetAtt.endOffset() + "\t" + attribute + "\t"
                      + typeAttr.type() + "\t" + posL.getPositionLength() + "\t"
                      + positionAttr.getPositionIncrement());
          }
          // BUG FIX: the consumer workflow requires end() after the last
          // incrementToken() and close() to release the stream.
          tokenStream.end();
      } finally {
          tokenStream.close();
      }
  }