
x33g5p2x  于2022-02-03 转载在 其他  





代码示例来源:origin: stanfordnlp/CoreNLP

 * Returns a "pretty" version of the words in this Document suitable for
 * display. The default implementation returns each of the words in
 * this Document separated
 * by spaces. Specifically, each element that implements {@link HasWord}
 * has its
 * {@link HasWord#word} printed, and other elements are skipped.
 * Subclasses that maintain additional information may wish to
 * override this method.
public String presentableText() {
 StringBuilder sb = new StringBuilder();
 for (Word cur : this) {
  if (sb.length() > 0) {
   sb.append(' ');
 return (sb.toString());

代码示例来源:origin: stanfordnlp/CoreNLP

public static String tokensToString(Word [] tokens) {
 StringBuilder  sb = new StringBuilder(512);
 for(int i = 0; i < tokens.length; i ++){
  if(i > 0) sb.append(" ");
  Word l = tokens[i];
  sb.append(l.word() + "{" + l.beginPosition() + ", " + l.endPosition() + "}");
 return sb.toString();

代码示例来源:origin: stanfordnlp/CoreNLP

 * Stems <code>w</code> and returns stemmed <code>Word</code>.
public Word stem(Word w) {
 return (new Word(stem(w.word())));

代码示例来源:origin: stanfordnlp/CoreNLP

boolean justInsertedNewline = false; // to prevent contiguous newlines
for (Word w : in) {
 String ws = w.word();
 if (ws.startsWith("<") && ws.endsWith(">")) {
  if (markLineBreaks && !justInsertedNewline) {

代码示例来源:origin: stanfordnlp/CoreNLP

/** Return the tokens using PTB tokenizer.
 *  @param str String to tokenize
 *  @return Array of the token strings
 */
private String[] ptbTokenize(String str) {
 // todo [cdm 2017]: Someday should generalize this to allow use of other tokenizers
 if (ptbFactory==null) {
  ptbFactory = PTBTokenizer.factory();
 Tokenizer<Word> tokenizer = ptbFactory.getTokenizer(new StringReader(str));
 List<Word> words = tokenizer.tokenize();
 String[] res = new String[words.size()];
 for (int i = 0, sz = words.size(); i < sz; i++) {
  res[i] = words.get(i).word();
 return res;

代码示例来源:origin: stanfordnlp/CoreNLP

Matcher hasArabic = utf8ArabicChart.matcher(token.word());
if(hasArabic.find()) {
 token.setWord(, token.word()));

代码示例来源:origin: stanfordnlp/CoreNLP

 * Test program for demonstrating the Stemmer.  It reads text from
 * a list of files, stems each word, and writes the result to standard
 * output. Note that the word stemmed is expected to be in lower case:
 * forcing lower case must be done outside the Stemmer class.
 * Usage: Stemmer file-name file-name ...
public static void main(String[] args) throws IOException {
 Stemmer s = new Stemmer();
 if (args[0].equals("-file")) {
  Iterator<Word> it = PTBTokenizer.newPTBTokenizer(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
  while (it.hasNext()) {
   Word token =;
   System.out.print(' ');
 } else {
  for (String arg : args) {
   System.out.print(' ');

代码示例来源:origin: stanfordnlp/CoreNLP

int numAdded = 0;
while (tok.hasNext()) {
 String s =;

代码示例来源:origin: stanfordnlp/CoreNLP

DFSAState<Word, Integer> fromState = tr.getSource();
Word word = tr.getInput();
if (!word.word().equals(" "))
 segmentedWords.add(0, word);
i = fromState.stateID();

代码示例来源:origin: stanfordnlp/CoreNLP

for (; ;) {
 Word word = (Word);
 if (sentIter.hasNext()) {
  pw.print(" ");


import edu.stanford.nlp.ling.Word;

List<Word> words = ...
for (Word word : words) {
 if (word.word().equals(args(1))) {

代码示例来源:origin: pilehvar/ADW

public List<String> tokenizeString(String string)
  final List<String> tokens = new ArrayList<String>();
  for (Word w : tokenize(string))
  return tokens;

代码示例来源:origin: edu.stanford.nlp/stanford-corenlp

 * Returns a "pretty" version of the words in this Document suitable for
 * display. The default implementation returns each of the words in
 * this Document separated
 * by spaces. Specifically, each element that implements {@link HasWord}
 * has its
 * {@link HasWord#word} printed, and other elements are skipped.
 * Subclasses that maintain additional information may wish to
 * override this method.
public String presentableText() {
 StringBuilder sb = new StringBuilder();
 for (Word cur : this) {
  if (sb.length() > 0) {
   sb.append(' ');
 return (sb.toString());

代码示例来源:origin: edu.stanford.nlp/corenlp

 * Returns a "pretty" version of the words in this Document suitable for
 * display. The default implementation returns each of the words in
 * this Document separated
 * by spaces. Specifically, each element that implements {@link HasWord}
 * has its
 * {@link HasWord#word} printed, and other elements are skipped.
 * <p/>
 * <p>Subclasses that maintain additional information may wish to
 * override this method.</p>
public String presentableText() {
 StringBuilder sb = new StringBuilder();
 for (Word cur : this) {
  if (sb.length() > 0) {
   sb.append(' ');
 return (sb.toString());

代码示例来源:origin: com.googlecode.mate-tools/srl

public String[] tokenize(String sentence) {
  Reader r=new StringReader(sentence);
  PTBTokenizer<Word> tokenizer=PTBTokenizer.newPTBTokenizer(r);
  List<String> l=new ArrayList<String>();
  String[] tok=new String[l.size()+1];
  int i=1;
  for(String s:l)
  return tok;

代码示例来源:origin: microth/PathLSTM

public String[] tokenize(String sentence) {
  Reader r = new StringReader(sentence);
  PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(r);
  List<String> l = new ArrayList<>();
  while (tokenizer.hasNext()) {
    Word w =;
  String[] tok = new String[l.size() + 1];
  tok[0] =;
  int i = 1;
  for (String s : l)
    tok[i++] = s;
  return tok;

代码示例来源:origin: edu.stanford.nlp/stanford-corenlp

/** Return the tokens using PTB tokenizer.
 *  @param str String to tokenize
 *  @return Array of the token strings
 */
private String[] ptbTokenize(String str) {
 // todo [cdm 2017]: Someday should generalize this to allow use of other tokenizers
 if (ptbFactory==null) {
  ptbFactory = PTBTokenizer.factory();
 Tokenizer<Word> tokenizer = ptbFactory.getTokenizer(new StringReader(str));
 List<Word> words = tokenizer.tokenize();
 String[] res = new String[words.size()];
 for (int i = 0, sz = words.size(); i < sz; i++) {
  res[i] = words.get(i).word();
 return res;

代码示例来源:origin: edu.stanford.nlp/corenlp

protected Word getNext() {
 while (wordIter == null || ! wordIter.hasNext()) {
  if ( ! tok.hasNext()) {
   return null;
  String s =;
  if (s == null) {
   return null;
  ArrayList<Word> se = segmentWords(s);
  wordIter = se.iterator();

代码示例来源:origin: microth/PathLSTM

  public StringInText[] tokenizeplus(String sentence) {
    Reader r = new StringReader(sentence);
    List<StringInText> l = new ArrayList<>();
    for (String s : tokenize(sentence)) {
      Word w = new Word(s);
      l.add(new StringInText(w.word(), w.beginPosition() + startpos, w
          .endPosition() + startpos));
    StringInText[] tok = new StringInText[l.size()];
    // tok[0]=new StringInText(,0,0);
    int i = 0;
    for (StringInText s : l)
      tok[i++] = s;

    startpos += (1 + sentence.length());

    return tok;

代码示例来源:origin: microth/PathLSTM

public StringInText[] tokenizeplus(String sentence) {
  Reader r = new StringReader(sentence);
  PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(r);
  List<StringInText> l = new ArrayList<>();
  while (tokenizer.hasNext()) {
    Word w =;
    l.add(new StringInText(w.word(), w.beginPosition() + startpos, w
        .endPosition() + startpos));
  StringInText[] tok = new StringInText[l.size() + 1];
  tok[0] = new StringInText(, 0, 0);
  int i = 1;
  for (StringInText s : l)
    tok[i++] = s;
  startpos += (1 + sentence.length());
  return tok;
