
x33g5p2x  于2022-01-30 转载在 其他  



[英]Get the position of the token that corresponds to the character offset passed as a parameter. This function is useful when dealing with corpora that specify annotations in terms of character offsets. In particular, the CuratorClient uses this function to convert views from the Curator representation. NOTE: one-past-the-end indexing can make this problematic. Currently, constituents are processed so that only characters within tokens are mapped to token ids (avoiding ambiguity at the cost of introducing complexity for users thinking in terms of one-past-the-end indexing), i.e. you MUST modify the end offset in the call if you are using one-past-the-end offsets (for example, Curator data structures use one-past-the-end offsets, as do TextAnnotation Views/Constituents). This behavior was chosen to handle the case where there is arbitrary whitespace, and to avoid confusion when two tokens are contiguous (the first character of the second token would conflict with the last (one-past-the-end) character of the first). UPDATED to allow a non-zero first-token character offset (i.e. in the case where the source text has a markup preamble that you want to ignore). The current implementation maps character offsets that do not fall within a token to the index '-1'.


代码示例来源:origin: edu.illinois.cs.cogcomp/wikipediaAPI

 * Ignores the bug in pre-computing token offsets
 * @param ta
private static void validateTextAnnotationOffset(TextAnnotation ta){
  }catch(Exception e){

代码示例来源:origin: edu.illinois.cs.cogcomp/wikipediaAPI-multilingual

 * Ignores the bug in pre-computing token offsets
 * @param ta
private static void validateTextAnnotationOffset(TextAnnotation ta) {
  try {
  } catch (Exception e) {

代码示例来源:origin: CogComp/cogcomp-nlp

// Builds a Constituent over the tokens of `ta` that correspond to the character span `span`.
protected static Constituent getNewConstituentForSpan(String label, String viewName,
    TextAnnotation ta, Span span) {
  // Token id for the span's starting character offset.
  int start = ta.getTokenIdFromCharacterOffset(span.getStart());
  // Look up the token for the character just before the span's ending offset, then add 1
  // to obtain the end token index handed to the Constituent constructor.
  // NOTE(review): presumably span.getEnding() is one-past-the-end — confirm against Span.
  int end = ta.getTokenIdFromCharacterOffset(span.getEnding() - 1) + 1;
  Constituent constituent = new Constituent(label, viewName, ta, start, end);
  // Attributes are copied only when the span reports that they are set.
  if (span.isSetAttributes()) {
    copyAttributesToConstituent(span, constituent);
  return constituent;

代码示例来源:origin: edu.illinois.cs.cogcomp/illinois-curator

// Creates a Constituent in `ta` for the token range matching the character offsets of `span`.
protected static Constituent getNewConstituentForSpan(String label, String viewName,
    TextAnnotation ta, Span span) {
  // Start token: the token id returned for the span's start character offset.
  int start = ta.getTokenIdFromCharacterOffset(span.getStart());
  // End token: the ending offset is decremented before the lookup and the resulting
  // token id is incremented afterwards.
  // NOTE(review): this looks like a one-past-the-end correction on both sides — confirm.
  int end = ta.getTokenIdFromCharacterOffset(span.getEnding() - 1) + 1;
  Constituent constituent = new Constituent(label, viewName, ta, start, end);
  // Only spans that have attributes set get them copied to the constituent.
  if (span.isSetAttributes()) {
    copyAttributesToConstituent(span, constituent);
  return constituent;

代码示例来源:origin: edu.illinois.cs.cogcomp/illinois-caching-curator

// Converts a character-offset Span into a token-indexed Constituent on `ta`.
protected static Constituent getNewConstituentForSpan(String label, String viewName, TextAnnotation ta, Span span) {
  // Token id for the first character of the span.
  int start = ta.getTokenIdFromCharacterOffset(span.getStart());
  // The lookup uses (ending offset - 1); the returned token id is then bumped by 1.
  // NOTE(review): presumably both the span end and the constituent end are one-past-the-end — confirm.
  int end = ta.getTokenIdFromCharacterOffset(span.getEnding() - 1) + 1;
  Constituent constituent = new Constituent(label, viewName, ta, start, end);
  // Copy attributes across when the span has them set.
  if (span.isSetAttributes()) {
    copyAttributesToConstituent(span, constituent);
  return constituent;

代码示例来源:origin: CogComp/cogcomp-nlp

// Adds a QUANTITIES view to `ta`, with one span label per QuantSpan returned by getSpans.
public void addView(TextAnnotation ta) throws AnnotatorException {
  // Checked only when JVM assertions are enabled.
  assert (ta.hasView(ViewNames.SENTENCE));
  SpanLabelView quantifierView =
      new SpanLabelView(ViewNames.QUANTITIES, "illinois-quantifier", ta, 1d);
  List<QuantSpan> quantSpans = getSpans(ta.getTokenizedText(), true, ta);
  for (QuantSpan span : quantSpans) {
    // Convert the span's character offsets into token ids.
    int startToken = ta.getTokenIdFromCharacterOffset(span.start);
    // NOTE(review): span.end is passed unadjusted; whether it is inclusive or
    // one-past-the-end is not visible here — confirm against QuantSpan.
    int endToken = ta.getTokenIdFromCharacterOffset(span.end);
    quantifierView.addSpanLabel(startToken, endToken, span.object.toString(), 1d);
  ta.addView(ViewNames.QUANTITIES, quantifierView);

代码示例来源:origin: edu.illinois.cs.cogcomp/illinois-quantifier

// Populates a SpanLabelView with the quantity spans found by getSpans and attaches it to `ta`.
public void addView(TextAnnotation ta) throws AnnotatorException {
  // This check has an effect only when assertions are enabled on the JVM.
  assert (ta.hasView(ViewNames.SENTENCE));
  SpanLabelView quantifierView =
      new SpanLabelView(ViewNames.QUANTITIES, "illinois-quantifier", ta, 1d);
  List<QuantSpan> quantSpans = getSpans(ta.getTokenizedText(), true, ta);
  for (QuantSpan span : quantSpans) {
    // Map the character offsets of the span to token ids in `ta`.
    int startToken = ta.getTokenIdFromCharacterOffset(span.start);
    // NOTE(review): no -1/+1 adjustment is applied to span.end here; verify the
    // indexing convention of QuantSpan and of addSpanLabel's end argument.
    int endToken = ta.getTokenIdFromCharacterOffset(span.end);
    quantifierView.addSpanLabel(startToken, endToken, span.object.toString(), 1d);
  ta.addView(ViewNames.QUANTITIES, quantifierView);

代码示例来源:origin: CogComp/cogcomp-nlp

 * Gets the token index of a Stanford dependency node relative to the current sentence
 * @param ta The TextAnnotation containing the sentences
 * @param node The Stanford Dependency node
 * @param sentId The sentence number
 * @return The token index relative to sentence
// Returns the node's token id minus sentenceStart.
private int getNodePosition(TextAnnotation ta, IndexedWord node, int sentId) {
  // NOTE(review): the right-hand side of this assignment is missing from this excerpt.
  int sentenceStart =
  int nodeCharacterOffset = node.beginPosition();
  // Map the node's begin character offset to a token id in `ta`.
  int tokenStartSpan = ta.getTokenIdFromCharacterOffset(nodeCharacterOffset);
  return tokenStartSpan - sentenceStart;

代码示例来源:origin: edu.illinois.cs.cogcomp/stanford_3.3.1

 * Gets the token index of a Stanford dependency node relative to the current sentence
 * @param ta The TextAnnotation containing the sentences
 * @param node The Stanford Dependency node
 * @param sentId The sentence number
 * @return The token index relative to sentence
// Computes a token index for `node` as (token id of the node) - sentenceStart.
private int getNodePosition(TextAnnotation ta, IndexedWord node, int sentId) {
  // NOTE(review): the initializer of sentenceStart is not present in this excerpt.
  int sentenceStart =
  int nodeCharacterOffset = node.beginPosition();
  // Token id in `ta` for the character offset at which the node begins.
  int tokenStartSpan = ta.getTokenIdFromCharacterOffset(nodeCharacterOffset);
  return tokenStartSpan - sentenceStart;

代码示例来源:origin: CogComp/cogcomp-nlp

 * Helper function to create a head constituent from an extent constituent.
// Builds a head Constituent in `viewName` from the head character offsets of an extent constituent.
public static Constituent getEntityHeadForConstituent(Constituent extentConstituent,
                            TextAnnotation textAnnotation,
                            String viewName) {
  // NOTE(review): the initializer of startCharOffset is missing from this excerpt.
  int startCharOffset =
  // End offset: the EntityHeadEndCharOffset attribute parsed as an int, minus 1.
  int endCharOffset =
      Integer.parseInt(extentConstituent.getAttribute(ACEReader.EntityHeadEndCharOffset)) - 1;
  int startToken = textAnnotation.getTokenIdFromCharacterOffset(startCharOffset);
  int endToken = textAnnotation.getTokenIdFromCharacterOffset(endCharOffset);
  // Build the constituent only when both token ids are non-negative and the end
  // token is not before the start token.
  if (startToken >= 0 && endToken >= 0 && !(endToken - startToken < 0)) {
    // endToken + 1 is passed as the constituent's end token index.
    Constituent cons =
        new Constituent(extentConstituent.getLabel(), 1.0, viewName, textAnnotation,
            startToken, endToken + 1);
    // Carry over every attribute of the extent constituent.
    for (String attributeKey : extentConstituent.getAttributeKeys()) {
      cons.addAttribute(attributeKey, extentConstituent.getAttribute(attributeKey));
    return cons;
  // NOTE(review): presumably reached when the check above fails; closing braces are
  // missing from this excerpt.
  return null;

代码示例来源:origin: edu.illinois.cs.cogcomp/illinois-corpusreaders

 * Helper function to create a head constituent from an extent constituent.
// Creates the entity-head Constituent for `extentConstituent`, or returns null.
public static Constituent getEntityHeadForConstituent(Constituent extentConstituent,
                            TextAnnotation textAnnotation,
                            String viewName) {
  // NOTE(review): this excerpt does not show what startCharOffset is initialized to.
  int startCharOffset =
  // The head end offset is read from the EntityHeadEndCharOffset attribute and reduced by 1.
  int endCharOffset =
      Integer.parseInt(extentConstituent.getAttribute(ACEReader.EntityHeadEndCharOffset)) - 1;
  // Translate both character offsets into token ids.
  int startToken = textAnnotation.getTokenIdFromCharacterOffset(startCharOffset);
  int endToken = textAnnotation.getTokenIdFromCharacterOffset(endCharOffset);
  // Requires non-negative token ids and endToken >= startToken.
  if (startToken >= 0 && endToken >= 0 && !(endToken - startToken < 0)) {
    // The constituent reuses the extent's label with score 1.0 and ends at endToken + 1.
    Constituent cons =
        new Constituent(extentConstituent.getLabel(), 1.0, viewName, textAnnotation,
            startToken, endToken + 1);
    // Copy all attributes from the extent constituent to the new constituent.
    for (String attributeKey : extentConstituent.getAttributeKeys()) {
      cons.addAttribute(attributeKey, extentConstituent.getAttribute(attributeKey));
    return cons;
  // NOTE(review): likely the fallback for a failed check — the excerpt omits closing braces.
  return null;

代码示例来源:origin: edu.illinois.cs.cogcomp/illinois-curator

int topTokenId = ta.getTokenIdFromCharacterOffset(topNode.getSpan().getStart());
    int childTokenId = ta.getTokenIdFromCharacterOffset(childNode.getSpan().getStart());

代码示例来源:origin: edu.illinois.cs.cogcomp/md

Integer.parseInt(extentConstituent.getAttribute(ACEReader.EntityHeadEndCharOffset)) - 1;
int startToken = textAnnotation.getTokenIdFromCharacterOffset(startCharOffset);
int endToken = textAnnotation.getTokenIdFromCharacterOffset(endCharOffset);

代码示例来源:origin: edu.illinois.cs.cogcomp/illinois-caching-curator

 * Aligns a {@link Labeling} to a {@link TokenLabelView}.
 * @return A TokenLabelView
// Builds a TokenLabelView named `viewName` from the spans of `labeling`.
public static TokenLabelView alignLabelingToTokenLabelView(String viewName, TextAnnotation ta, Labeling labeling) {
  List<Span> labels = labeling.getLabels();
  double score = labeling.getScore();
  String generator = labeling.getSource();
  TokenLabelView view = new TokenLabelView(viewName, generator, ta, score);
  for (Span span : labels) {
    int tokenId = ta.getTokenIdFromCharacterOffset(span.getStart());
    // NOTE(review): the ending offset is passed unadjusted; if span.getEnding() is
    // one-past-the-end this may resolve to a following token or to -1 — confirm.
    int endTokenId = ta.getTokenIdFromCharacterOffset(span.getEnding());
    // NOTE(review): as excerpted, this `if` has no statement of its own and therefore
    // guards the loop below; a line may have been dropped — check the original source.
    if (tokenId == endTokenId)
    for (int i = tokenId; i < endTokenId; i++) {
      view.addTokenLabel(i, span.getLabel(), span.getScore());
      // When the span carries a non-empty attribute collection, copy it onto the constituent at token i.
      if (span.isSetAttributes() && span.getAttributes().size() > 0) {
        Constituent c = view.getConstituentAtToken(i);
        copyAttributesToConstituent(span, c);
  return view;

代码示例来源:origin: edu.illinois.cs.cogcomp/illinois-caching-curator

// Builds a TreeView named `viewName` and sets one dependency tree per tree in `dep`.
public static TreeView alignForestToDependencyView(String viewName, TextAnnotation ta, Forest dep) {
  TreeView view = new TreeView(viewName, dep.getSource(), ta, 0.0d);
  for (edu.illinois.cs.cogcomp.thrift.base.Tree tree : dep.getTrees()) {
    // tree.getTop() is used below as an index into tree.getNodes().
    int topId = tree.getTop();
    List<Node> nodes = tree.getNodes();
    // Start character offset of the top node, mapped to a token id.
    int topTokenStart = nodes.get(topId).getSpan().getStart();
    int topTokenId = ta.getTokenIdFromCharacterOffset(topTokenStart);
    // The sentence id is looked up from the top node's token.
    int sentenceId = ta.getSentenceId(topTokenId);
    Tree<Pair<String, Integer>> dependencyTree = makeDependencyTree(ta, tree);
    double score = tree.getScore();
    view.setDependencyTree(sentenceId, dependencyTree, score);
  return view;

代码示例来源:origin: CogComp/cogcomp-nlp

 * Aligns a {@link edu.illinois.cs.cogcomp.thrift.base.Labeling} to a
 * {@link edu.illinois.cs.cogcomp.core.datastructures.textannotation.TokenLabelView}.
 * <b>NOTE:</b> must correct for one-past-the-end labeling when calling
 * {@link TextAnnotation#getTokenIdFromCharacterOffset(int)}.
 * @return A TokenLabelView
// Builds a TokenLabelView named `viewName` from the spans of `labeling`.
public static TokenLabelView alignLabelingToTokenLabelView(String viewName, TextAnnotation ta,
    Labeling labeling) {
  List<Span> labels = labeling.getLabels();
  double score = labeling.getScore();
  String generator = labeling.getSource();
  TokenLabelView view = new TokenLabelView(viewName, generator, ta, score);
  for (Span span : labels) {
    int tokenId = ta.getTokenIdFromCharacterOffset(span.getStart());
    // The ending offset is decremented by 1 before the lookup.
    // NOTE(review): presumably because span.getEnding() is one-past-the-end — confirm.
    int endTokenId = ta.getTokenIdFromCharacterOffset(span.getEnding() - 1);
    // NOTE(review): as excerpted, this `if` has no statement of its own and therefore
    // guards the loop below; a line may have been dropped — check the original source.
    if (tokenId == endTokenId)
    for (int i = tokenId; i < endTokenId; i++) {
      view.addTokenLabel(i, span.getLabel(), span.getScore());
      // When the span carries a non-empty attribute collection, copy it onto the constituent at token i.
      if (span.isSetAttributes() && span.getAttributes().size() > 0) {
        Constituent c = view.getConstituentAtToken(i);
        copyAttributesToConstituent(span, c);
  return view;

代码示例来源:origin: edu.illinois.cs.cogcomp/illinois-curator

 * Aligns a {@link edu.illinois.cs.cogcomp.thrift.base.Labeling} to a
 * {@link edu.illinois.cs.cogcomp.core.datastructures.textannotation.TokenLabelView}.
 * <b>NOTE:</b> must correct for one-past-the-end labeling when calling
 * {@link TextAnnotation#getTokenIdFromCharacterOffset(int)}.
 * @return A TokenLabelView
// Converts the labeled character spans of `labeling` into token labels on a new TokenLabelView.
public static TokenLabelView alignLabelingToTokenLabelView(String viewName, TextAnnotation ta,
    Labeling labeling) {
  List<Span> labels = labeling.getLabels();
  double score = labeling.getScore();
  // The labeling's source is passed as the view's generator argument.
  String generator = labeling.getSource();
  TokenLabelView view = new TokenLabelView(viewName, generator, ta, score);
  for (Span span : labels) {
    int tokenId = ta.getTokenIdFromCharacterOffset(span.getStart());
    // Lookup uses (ending offset - 1).
    // NOTE(review): looks like a one-past-the-end correction — confirm against Span.
    int endTokenId = ta.getTokenIdFromCharacterOffset(span.getEnding() - 1);
    // NOTE(review): in this excerpt the `if` is immediately followed by the `for`, so it
    // guards the loop; the original source may contain a statement that was dropped here.
    if (tokenId == endTokenId)
    for (int i = tokenId; i < endTokenId; i++) {
      view.addTokenLabel(i, span.getLabel(), span.getScore());
      // Attributes are copied only when set and non-empty.
      if (span.isSetAttributes() && span.getAttributes().size() > 0) {
        Constituent c = view.getConstituentAtToken(i);
        copyAttributesToConstituent(span, c);
  return view;

代码示例来源:origin: CogComp/cogcomp-nlp

// Builds a TreeView named `viewName` and sets one dependency tree per tree in `dep`.
public static TreeView alignForestToDependencyView(String viewName, TextAnnotation ta,
    Forest dep) {
  TreeView view = new TreeView(viewName, dep.getSource(), ta, 0.0d);
  for (edu.illinois.cs.cogcomp.thrift.base.Tree tree : dep.getTrees()) {
    // tree.getTop() is used below as an index into tree.getNodes().
    int topId = tree.getTop();
    List<Node> nodes = tree.getNodes();
    // Start character offset of the top node, mapped to a token id.
    int topTokenStart = nodes.get(topId).getSpan().getStart();
    int topTokenId = ta.getTokenIdFromCharacterOffset(topTokenStart);
    // The sentence id is looked up from the top node's token.
    int sentenceId = ta.getSentenceId(topTokenId);
    Tree<Pair<String, Integer>> dependencyTree = makeDependencyTree(ta, tree);
    double score = tree.getScore();
    view.setDependencyTree(sentenceId, dependencyTree, score);
  return view;

代码示例来源:origin: edu.illinois.cs.cogcomp/illinois-curator

// Converts each tree of the Forest `dep` into a dependency tree stored on a new TreeView.
public static TreeView alignForestToDependencyView(String viewName, TextAnnotation ta,
    Forest dep) {
  TreeView view = new TreeView(viewName, dep.getSource(), ta, 0.0d);
  for (edu.illinois.cs.cogcomp.thrift.base.Tree tree : dep.getTrees()) {
    int topId = tree.getTop();
    List<Node> nodes = tree.getNodes();
    // The top node is fetched from the node list by the index returned from getTop().
    int topTokenStart = nodes.get(topId).getSpan().getStart();
    // Its start character offset is mapped to a token id, which selects the sentence.
    int topTokenId = ta.getTokenIdFromCharacterOffset(topTokenStart);
    int sentenceId = ta.getSentenceId(topTokenId);
    Tree<Pair<String, Integer>> dependencyTree = makeDependencyTree(ta, tree);
    // Each tree's own score is stored with it.
    double score = tree.getScore();
    view.setDependencyTree(sentenceId, dependencyTree, score);
  return view;

代码示例来源:origin: CogComp/cogcomp-nlp

int cleanTextCharStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
int cleanTextCharEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
int cleanTextNeTokStart = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
int cleanTextNeTokEnd = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1); // StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
Constituent neCon = new Constituent(neLabel, nerView.getViewName(), ta, cleanTextNeTokStart, cleanTextNeTokEnd + 1); //constituent token indexing uses one-past-the-end
