Source code for utils.eval_metrics

"""
File for creating metric functions
"""
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import recall_score as class_recall_score
from seqeval.metrics import f1_score as seq_f1
from seqeval.metrics import precision_score, recall_score

[docs]def classification_accuracy(yTrue, yPred):
    """
    Accuracy score for classification tasks using the label provided in file and predictions from multi-task model.
    It takes a batch of predictions and labels.

    To use this metric, add **classification_accuracy** into list of ``metrics`` in task file.

    Args:
        yPred (:obj:`list`) : [0, 2, 1, 3]
        yTrue (:obj:`list`) : [0, 1, 2, 3]

    """
    return accuracy_score(yTrue, yPred)*100

[docs]def classification_f1_score(yTrue, yPred):
    """
    Standard f1 score from sklearn for classification tasks.
    It takes a batch of predictions and labels.

    To use this metric, add **classification_f1_score** into list of ``metrics`` in task file.

    Args:
        yPred (:obj:`list`) : [0, 2, 1, 3]
        yTrue (:obj:`list`) : [0, 1, 2, 3]

    """
    return f1_score(yTrue, yPred, average='micro')

def classification_recall(yTrue, yPred):
    """
    Standard recall score from sklearn for classification tasks.
    It takes a batch of predictions and labels.

    To use this metric, add **classification_f1_score** into list of ``metrics`` in task file.

    Args:
        yPred (:obj:`list`) : [0, 2, 1, 3]
        yTrue (:obj:`list`) : [0, 1, 2, 3]

    """
    return class_recall_score(yTrue, yPred, average='micro')

[docs]def seqeval_f1_score(yTrue, yPred):
    """
    f1 score for NER/sequence labelling tasks taken from the `seqeval <https://github.com/chakki-works/seqeval>`_ library.
    
    To use this metric, add **seqeval_f1_score** into list of ``metrics`` in task file.

    Args:
        yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
    """
    return seq_f1(yTrue, yPred)

[docs]def seqeval_precision(yTrue, yPred):
    """
    Precision score for NER/sequence labelling tasks taken from the `seqeval <https://github.com/chakki-works/seqeval>`_ library.
    
    To use this metric, add **seqeval_precision** into list of ``metrics`` in task file.

    Args:
        yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
    """
    return precision_score(yTrue, yPred)

[docs]def seqeval_recall(yTrue, yPred):

    """
    Recall score for NER/sequence labelling tasks taken from the `seqeval <https://github.com/chakki-works/seqeval>`_ library.
    
    To use this metric, add **seqeval_recall** into list of ``metrics`` in task file.

    Args:
        yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
    """
    return recall_score(yTrue, yPred)


# compute f1 score is modified from conlleval.pl
def __startOfChunk(prevTag, tag, prevTagType, tagType, chunkStart = False):
    if prevTag == 'B' and tag == 'B':
        chunkStart = True
    if prevTag == 'I' and tag == 'B':
        chunkStart = True
    if prevTag == 'O' and tag == 'B':
        chunkStart = True
    if prevTag == 'O' and tag == 'I':
        chunkStart = True

    if prevTag == 'E' and tag == 'E':
        chunkStart = True
    if prevTag == 'E' and tag == 'I':
        chunkStart = True
    if prevTag == 'O' and tag == 'E':
        chunkStart = True
    if prevTag == 'O' and tag == 'I':
        chunkStart = True

    if tag != 'O' and tag != '.' and prevTagType != tagType:
        chunkStart = True
    return chunkStart

def __endOfChunk(prevTag, tag, prevTagType, tagType, chunkEnd = False):
    if prevTag == 'B' and tag == 'B':
        chunkEnd = True
    if prevTag == 'B' and tag == 'O':
        chunkEnd = True
    if prevTag == 'I' and tag == 'B':
        chunkEnd = True
    if prevTag == 'I' and tag == 'O':
        chunkEnd = True

    if prevTag == 'E' and tag == 'E':
        chunkEnd = True
    if prevTag == 'E' and tag == 'I':
        chunkEnd = True
    if prevTag == 'E' and tag == 'O':
        chunkEnd = True
    if prevTag == 'I' and tag == 'O':
        chunkEnd = True

    if prevTag != 'O' and prevTag != '.' and prevTagType != tagType:
        chunkEnd = True
    return chunkEnd

def __splitTagType(tag):
    s = tag.split('-')
    if len(s) > 2 or len(s) == 0:
        raise ValueError('tag format wrong. it must be B-xxx.xxx')
    if len(s) == 1:
        tag = s[0]
        tagType = ""
    else:
        tag = s[0]
        tagType = s[1]
    return tag, tagType

def computeF1Score(correct_slots, pred_slots):

    correctChunk = {}
    correctChunkCnt = 0
    foundCorrect = {}
    foundCorrectCnt = 0
    foundPred = {}
    foundPredCnt = 0
    correctTags = 0
    tokenCount = 0
    for correct_slot, pred_slot in zip(correct_slots, pred_slots):
        inCorrect = False
        lastCorrectTag = 'O'
        lastCorrectType = ''
        lastPredTag = 'O'
        lastPredType = ''
        for c, p in zip(correct_slot, pred_slot):
            correctTag, correctType = __splitTagType(c)
            predTag, predType = __splitTagType(p)

            if inCorrect == True:
                if __endOfChunk(lastCorrectTag, correctTag, lastCorrectType, correctType) == True and \
                   __endOfChunk(lastPredTag, predTag, lastPredType, predType) == True and \
                   (lastCorrectType == lastPredType):
                    inCorrect = False
                    correctChunkCnt += 1
                    if lastCorrectType in correctChunk:
                        correctChunk[lastCorrectType] += 1
                    else:
                        correctChunk[lastCorrectType] = 1
                elif __endOfChunk(lastCorrectTag, correctTag, lastCorrectType, correctType) != \
                     __endOfChunk(lastPredTag, predTag, lastPredType, predType) or \
                     (correctType != predType):
                    inCorrect = False

            if __startOfChunk(lastCorrectTag, correctTag, lastCorrectType, correctType) == True and \
               __startOfChunk(lastPredTag, predTag, lastPredType, predType) == True and \
               (correctType == predType):
                inCorrect = True

            if __startOfChunk(lastCorrectTag, correctTag, lastCorrectType, correctType) == True:
                foundCorrectCnt += 1
                if correctType in foundCorrect:
                    foundCorrect[correctType] += 1
                else:
                    foundCorrect[correctType] = 1

            if __startOfChunk(lastPredTag, predTag, lastPredType, predType) == True:
                foundPredCnt += 1
                if predType in foundPred:
                    foundPred[predType] += 1
                else:
                    foundPred[predType] = 1

            if correctTag == predTag and correctType == predType:
                correctTags += 1

            tokenCount += 1

            lastCorrectTag = correctTag
            lastCorrectType = correctType
            lastPredTag = predTag
            lastPredType = predType

        if inCorrect == True:
            correctChunkCnt += 1
            if lastCorrectType in correctChunk:
                correctChunk[lastCorrectType] += 1
            else:
                correctChunk[lastCorrectType] = 1

    if foundPredCnt > 0:
        precision = 100*correctChunkCnt/foundPredCnt
    else:
        precision = 0

    if foundCorrectCnt > 0:
        recall = 100*correctChunkCnt/foundCorrectCnt
    else:
        recall = 0

    if (precision+recall) > 0:
        f1 = (2*precision*recall)/(precision+recall)
    else:
        f1 = 0

    return f1, precision, recall

[docs]def snips_f1_score(yTrue, yPred):
    
    """
    f1 score for SNIPS NER/Slot filling task taken from the `MiuLab <https://github.com/MiuLab/SlotGated-SLU/blob/master/utils.py>`_ library.
    
    To use this metric, add **snips_f1_score** into list of ``metrics`` in task file.

    Args:
        yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        
    """
    
    snipsF1, _, _ = computeF1Score(yTrue, yPred)
    return snipsF1

[docs]def snips_precision(yTrue, yPred):
    """
    Precision score for SNIPS NER/Slot filling task taken from the `MiuLab <https://github.com/MiuLab/SlotGated-SLU/blob/master/utils.py>`_ library.
    
    To use this metric, add **snips_precision** into list of ``metrics`` in task file.

    Args:
        yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        
    """
    
    _, snipsPrecision, _ = computeF1Score(yTrue, yPred)
    return snipsPrecision
    
[docs]def snips_recall(yTrue, yPred):
    
    """
    Recall score for SNIPS NER/Slot filling task taken from the `MiuLab <https://github.com/MiuLab/SlotGated-SLU/blob/master/utils.py>`_ library.
    
    To use this metric, add **snips_recall** into list of ``metrics`` in task file.

    Args:
        yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        
    """
    _, _, snipsRecall = computeF1Score(yTrue, yPred)
    return snipsRecall