# coding=utf-8 # Copyright 2017 The Tensor2Tensor Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # -*- coding: utf-8 -*- # Copyright 2017 Google Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ROUGe metric implementation. This is a modified and slightly extended verison of https://github.com/miso-belica/sumy/blob/dev/sumy/evaluation/rouge.py. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import numpy as np import tensorflow as tf def _len_lcs(x, y): """Returns the length of the Longest Common Subsequence between two seqs. Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence Args: x: sequence of words y: sequence of words Returns integer: Length of LCS between x and y """ table = _lcs(x, y) n, m = len(x), len(y) return table[n, m] def _lcs(x, y): """Computes the length of the LCS between two seqs. The implementation below uses a DP programming algorithm and runs in O(nm) time where n = len(x) and m = len(y). Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence Args: x: collection of words y: collection of words Returns: Table of dictionary of coord and len lcs """ n, m = len(x), len(y) table = dict() for i in range(n + 1): for j in range(m + 1): if i == 0 or j == 0: table[i, j] = 0 elif x[i - 1] == y[j - 1]: table[i, j] = table[i - 1, j - 1] + 1 else: table[i, j] = max(table[i - 1, j], table[i, j - 1]) return table def _f_lcs(llcs, m, n): """Computes the LCS-based F-measure score. Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ rouge-working-note-v1.3.1.pdf Args: llcs: Length of LCS m: number of words in reference summary n: number of words in candidate summary Returns: Float. LCS-based F-measure score """ r_lcs = llcs / m p_lcs = llcs / n beta = p_lcs / (r_lcs + 1e-12) num = (1 + (beta**2)) * r_lcs * p_lcs denom = r_lcs + ((beta**2) * p_lcs) f_lcs = num / (denom + 1e-12) return f_lcs def rouge_l_sentence_level(eval_sentences, ref_sentences): """Computes ROUGE-L (sentence level) of two collections of sentences. Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ rouge-working-note-v1.3.1.pdf Calculated according to: R_lcs = LCS(X,Y)/m P_lcs = LCS(X,Y)/n F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) where: X = reference summary Y = Candidate summary m = length of reference summary n = length of candidate summary Args: eval_sentences: The sentences that have been picked by the summarizer ref_sentences: The sentences from the referene set Returns: A float: F_lcs """ f1_scores = [] for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences): m = len(ref_sentence) n = len(eval_sentence) lcs = _len_lcs(eval_sentence, ref_sentence) f1_scores.append(_f_lcs(lcs, m, n)) return np.mean(f1_scores, dtype=np.float32) def rouge_l_fscore(predictions, labels, **unused_kwargs): """ROUGE scores computation between labels and predictions. This is an approximate ROUGE scoring method since we do not glue word pieces or decode the ids and tokenize the output. Args: predictions: tensor, model predicitons labels: tensor, gold output. Returns: rouge_l_fscore: approx rouge-l f1 score. """ outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) # Convert the outputs and labels to a [batch_size, input_length] tensor. outputs = tf.squeeze(outputs, axis=[-1, -2]) labels = tf.squeeze(labels, axis=[-1, -2]) rouge_l_f_score = tf.py_func(rouge_l_sentence_level, (labels, outputs), tf.float32) return rouge_l_f_score, tf.constant(1.0) def _get_ngrams(n, text): """Calcualtes n-grams. Args: n: which n-grams to calculate text: An array of tokens Returns: A set of n-grams """ ngram_set = set() text_length = len(text) max_index_ngram_start = text_length - n for i in range(max_index_ngram_start + 1): ngram_set.add(tuple(text[i:i + n])) return ngram_set def rouge_n(eval_sentences, ref_sentences, n=2): """Computes ROUGE-N f1 score of two text collections of sentences. Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/ papers/rouge-working-note-v1.3.1.pdf Args: eval_sentences: The sentences that have been picked by the summarizer ref_sentences: The sentences from the reference set n: Size of ngram. Defaults to 2. Returns: f1 score for ROUGE-N """ f1_scores = [] for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences): eval_ngrams = _get_ngrams(n, eval_sentence) ref_ngrams = _get_ngrams(n, ref_sentence) ref_count = len(ref_ngrams) eval_count = len(eval_ngrams) # Gets the overlapping ngrams between evaluated and reference overlapping_ngrams = eval_ngrams.intersection(ref_ngrams) overlapping_count = len(overlapping_ngrams) # Handle edge case. This isn't mathematically correct, but it's good enough if eval_count == 0: precision = 0.0 else: precision = overlapping_count / eval_count if ref_count == 0: recall = 0.0 else: recall = overlapping_count / ref_count f1_scores.append(2.0 * ((precision * recall) / (precision + recall + 1e-8))) # return overlapping_count / reference_count return np.mean(f1_scores, dtype=np.float32) def rouge_2_fscore(predictions, labels, **unused_kwargs): """ROUGE-2 F1 score computation between labels and predictions. This is an approximate ROUGE scoring method since we do not glue word pieces or decode the ids and tokenize the output. Args: predictions: tensor, model predicitons labels: tensor, gold output. Returns: rouge2_fscore: approx rouge-2 f1 score. """ outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) # Convert the outputs and labels to a [batch_size, input_length] tensor. outputs = tf.squeeze(outputs, axis=[-1, -2]) labels = tf.squeeze(labels, axis=[-1, -2]) rouge_2_f_score = tf.py_func(rouge_n, (labels, outputs), tf.float32) return rouge_2_f_score, tf.constant(1.0)