spaCy Entity Ruler, Day and Time#

This notebook replicates the examples from https://spacy.io/usage/rule-based-matching to see what works, and then builds small playful variations on top of them.

Challenge: recognize day and time in German texts, e.g.

  • Dienstags ab 18 Uhr

  • Freies Spiel: Freitags 17 bis 19:30 auf dem Sportplatz

Version 2022-05-30

import spacy
from spacy import displacy

# conda install -c conda-forge spacy-model-en_core_web_sm
# EN: nlp = spacy.load("en_core_web_sm")
# DE model, if missing: python -m spacy download de_core_news_sm
nlp = spacy.load("de_core_news_sm")

Re-Tokenize#

A solved problem, but do we even need the solution here? Just in case.

Q: “Is there a way to add an regexp-keyed exception, to, say, match phone number?”

A: “No, there’s no way to have regular expressions as tokenizer exceptions. The tokenizer only looks for exceptions as exact string matches, mainly for reasons of speed. The other difficulty for this kind of example is that tokenizer exceptions currently can’t contain spaces. (Support for spaces is planned for a future version of spacy, but not regexes, which would still be too slow.) … I think the best way to do this would be to add a custom pipeline component at the beginning of the pipeline that retokenizes the document with the retokenizer: https://spacy.io/api/doc#retokenize.” (source)

We extend the pipeline with an early component that uses a regex to merge time expressions, which the tokenizer has split into several tokens, back into single spans. (Despite its name, markdownLinks_2_spans merges time expressions here, not Markdown links.)

from spacy.language import Language
import re

# https://spacy.io/usage/processing-pipelines#custom-components-simple
# https://spacy.io/usage/processing-pipelines#custom-components

@Language.component("markdownLinks_2_spans")
def markdownLinks_2_spans(doc):
    # https://spacy.io/usage/rule-based-matching#regex-text > Matching regular expressions on the full text
    
    # expression = r"""\s((([01]?\d)|(2[0123]))([\.,:]\d\d)? )(\s*Uhr)?\D"""
    expression = r"(([012]?\d\d)([\.,:]\d\d)?)(\s*Uhr)?"
    # expression = r"(([012]?\d)|(2[0123]{1}))([\.,:]\d\d)?(\s*Uhr)?"
    
    for match in re.finditer(expression, doc.text):
        start, end = match.span()
        span = doc.char_span(start, end)
        # This is a Span object or None if match doesn't map to valid token sequence
        if span is not None:
            print("Found match:", span.text)
            # https://spacy.io/api/doc#retokenizer.merge
            with doc.retokenize() as retokenizer:
                # https://spacy.io/usage/rule-based-matching#adding-patterns-attributes
                attrs = {'LEMMA': match.group(1), 
                         "POS" : "NOUN", 
                         "ENT_TYPE": "Uhrzeit"}
                retokenizer.merge(span, attrs = attrs)
    return doc
# nlp.remove_pipe("markdownLinks_2_spans")
# Register the component at the start of the pipeline
nlp.add_pipe("markdownLinks_2_spans", 
             name="markdownLinks_2_spans", 
             first=True)
<function __main__.markdownLinks_2_spans(doc)>
text =  ("""Wir treffen uns Dienstags 
von 17.15 Uhr bis 19 Uhr, 
und an Donnerstagen um 18:30 Uhr. 
Am 25.07 findet keine Probe statt.""" )
doc = nlp(text)
Found match: 17.15 Uhr
Found match: 19 Uhr
Found match: 18:30 Uhr
Found match: 25.07
[ (token.text, token.pos_, token.lemma_) for token in doc]
[('Wir', 'PRON', 'wir'),
 ('treffen', 'VERB', 'treffen'),
 ('uns', 'PRON', 'sich'),
 ('Dienstags', 'ADV', 'dienstags'),
 ('\n', 'SPACE', '\n'),
 ('von', 'ADP', 'von'),
 ('17.15 Uhr', 'PROPN', '17.15'),
 ('bis', 'ADP', 'bis'),
 ('19 Uhr', 'PROPN', '19'),
 (',', 'PUNCT', '--'),
 ('\n', 'SPACE', '\n'),
 ('und', 'CCONJ', 'und'),
 ('an', 'ADP', 'an'),
 ('Donnerstagen', 'NOUN', 'Donnerstag'),
 ('um', 'ADP', 'um'),
 ('18:30 Uhr', 'NOUN', '18:30'),
 ('.', 'PUNCT', '--'),
 ('\n', 'SPACE', '\n'),
 ('Am', 'ADP', 'an'),
 ('25.07', 'NUM', '25.07'),
 ('findet', 'VERB', 'finden'),
 ('keine', 'DET', 'kein'),
 ('Probe', 'NOUN', 'Probe'),
 ('statt', 'ADV', 'statt'),
 ('.', 'PUNCT', '--')]

Interesting: 17.15 Uhr is tagged as PROPN and 18:30 Uhr as NOUN, even though the component set POS to NOUN for both. Since the component runs first, the statistical tagger afterwards overwrites the attributes we set. This still needs improvement.

BUT an insight: the tokenizer's only job is to produce tokens. It is neither possible nor sensible to have the tokenizer already attach recognizable or recognized named entities and the like.
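A consequence worth sketching: if the attributes set during merging are supposed to survive, the component can be registered late in the pipeline instead of first, so that the statistical components no longer overwrite them. A minimal sketch, assuming the component from above is already registered and de_core_news_sm is installed:

# Sketch: add the merger at the END of the pipeline, after tagger/morphologizer/ner,
# so the POS and ENT_TYPE values from retokenizer.merge are not overwritten
nlp_late = spacy.load("de_core_news_sm")
nlp_late.add_pipe("markdownLinks_2_spans", last=True)
print(nlp_late.pipe_names)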

Rule-based matching#

“Rule-based systems are a good choice if there’s a more or less finite number of examples that you want to find in the data, or if there’s a very clear, structured pattern you can express with token rules or regular expressions. For instance, country names, IP addresses or URLs are things you might be able to handle well with a purely rule-based approach.” (https://spacy.io/usage/rule-based-matching)

Token based matching#

https://spacy.io/usage/rule-based-matching#matcher

https://spacy.io/usage/rule-based-matching#adding-patterns-attributes

text = """Hello, world! 
    Hello world! 
    Welcome students, definately!
    Servus, Barak Obama; Servus Erika!
    Hello Google I/O!
    Apple is opening its first big office in San Francisco."""
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern])

# Optionally, we could also choose to add more than one pattern, 
# for example to also match sequences without punctuation between “hello” and “world”:
patterns = [
    [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
    [{"LOWER": "hello"}, {"LOWER": "world"}],
    [{"LEMMA": {"IN": ["hello", "welcome"]}},
            {"POS": "NOUN"}],
    [{"TEXT": {"REGEX": "deff?in[ia]tely"}}],
    [{"LOWER": "servus"},
           {"IS_PUNCT": True, "OP": "?"}],
    [{"ORTH": "Servus"}, {"IS_PUNCT": True, "OP": "?"}, {}, {}]
]
matcher.add("HelloWorld", patterns)

doc = nlp(text)
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text, "|", span.lemma_)
15578876784678163569 HelloWorld 0 3 Hello, world | hello, world
15578876784678163569 HelloWorld 5 7 Hello world | Hello world
15578876784678163569 HelloWorld 9 11 Welcome students | welcome student
15578876784678163569 HelloWorld 12 13 definately | definately
15578876784678163569 HelloWorld 15 16 Servus | Servus
15578876784678163569 HelloWorld 15 17 Servus, | Servus,
15578876784678163569 HelloWorld 15 18 Servus, Barak | Servus, Barak
15578876784678163569 HelloWorld 15 19 Servus, Barak Obama | Servus, Barak Obama
15578876784678163569 HelloWorld 20 21 Servus | Servus
15578876784678163569 HelloWorld 20 23 Servus Erika! | Servus Erika!
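The "OP": "?" key used above is one of several quantifiers for token patterns. A small sketch of "*" and "+" (operators from the spaCy docs; matcher and nlp as in the cell above):

matcher_op = Matcher(nlp.vocab)
matcher_op.add("HelloOp", [
    [{"LOWER": "hello"}, {"IS_PUNCT": True, "OP": "*"}, {"LOWER": "world"}],  # zero or more punctuation tokens
    [{"LOWER": "hello"}, {"IS_PUNCT": True, "OP": "+"}],                      # one or more punctuation tokens
])
doc_op = nlp("Hello , , world!")
for _, start, end in matcher_op(doc_op):
    print(doc_op[start:end].text)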

Matching regular expressions on the full text#

https://spacy.io/usage/rule-based-matching#regex-text

import re

nlp = spacy.load("en_core_web_sm")
doc = nlp("The United States of America (USA) are commonly known as the United States (U.S. or US) or America.")

expression = r"[Uu](nited|\.?) ?[Ss](tates|\.?)"
for match in re.finditer(expression, doc.text):
    start, end = match.span()
    span = doc.char_span(start, end)
    # This is a Span object or None if match doesn't map to valid token sequence
    if span is not None:
        print("Found match:", span.text)
Found match: United States
Found match: United States
Found match: U.S.
Found match: US
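If a regex match does not line up with token boundaries, char_span returns None and the match is silently dropped. Since spaCy v3, alignment_mode="expand" snaps the character offsets to the surrounding tokens instead; a minimal sketch on the doc from above:

# Characters 0..7 cover "The Uni", i.e. they cut into the token "United";
# with alignment_mode="expand" the span grows to whole tokens
span = doc.char_span(0, 7, alignment_mode="expand")
print(span.text)  # "The United"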

Adding on_match rules#

https://spacy.io/usage/rule-based-matching#on_match

from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = English()
matcher = Matcher(nlp.vocab)

def add_event_ent(matcher, doc, i, matches):
    # Get the current match and create tuple of entity label, start and end.
    # Append entity to the doc's entity. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="EVENT")
    doc.ents += (entity,)
    print(entity.text)

pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
matcher.add("GoogleIO", [pattern], on_match=add_event_ent)
doc = nlp(text)
matches = matcher(doc)
Google I/O
from spacy import displacy
html = displacy.render(doc, style="ent", page=True,
                       options={"ents": ["EVENT"]})
displaCy
Hello, world!
Hello world!
Welcome students, definately!
Servus, Barak Obama; Servus Erika!
Hello Google I/O EVENT !
Apple is opening its first big office in San Francisco.

Creating spans from matches#

https://spacy.io/usage/rule-based-matching#matcher-spans

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("PERSON", [[{"lower": "barack"}, {"lower": "obama"}]])
doc = nlp("Barack Obama was the 44th president of the United States")

# 1. Return (match_id, start, end) tuples
matches = matcher(doc)
for match_id, start, end in matches:
    # Create the matched span and assign the match_id as a label
    span = Span(doc, start, end, label=match_id)
    print(span.text, span.label_)

# 2. Return Span objects directly
matches = matcher(doc, as_spans=True)
for span in matches:
    print(span.text, span.label_)
Barack Obama PERSON
Barack Obama PERSON
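The overlapping Servus matches further up illustrate that the Matcher returns every match, including sub-matches. spacy.util.filter_spans reduces a list of spans to the longest non-overlapping ones, which pairs nicely with as_spans=True:

from spacy.util import filter_spans

spans = matcher(doc, as_spans=True)
# keep only the longest spans, drop overlaps
print([span.text for span in filter_spans(spans)])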

Using custom pipeline components#

https://spacy.io/usage/rule-based-matching#matcher-pipeline

import spacy
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Token

# We're using a component factory because the component needs to be
# initialized with the shared vocab via the nlp object
@Language.factory("html_merger")
def create_bad_html_merger(nlp, name):
    return BadHTMLMerger(nlp.vocab)

class BadHTMLMerger:
    def __init__(self, vocab):
        patterns = [
            [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
            [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
        ]
        # Register a new token extension to flag bad HTML
        Token.set_extension("bad_html", default=False)
        self.matcher = Matcher(vocab)
        self.matcher.add("BAD_HTML", patterns)

    def __call__(self, doc):
        # This method is invoked when the component is called on a Doc
        matches = self.matcher(doc)
        spans = []  # Collect the matched spans here
        for match_id, start, end in matches:
            spans.append(doc[start:end])
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
                for token in span:
                    token._.bad_html = True  # Mark token as bad HTML
        return doc

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("html_merger", last=True)  # Add component to the pipeline
doc = nlp("Hello<br>world! <br/> This is a test.")
for token in doc:
    print(token.text, token._.bad_html)
Hello False
<br> True
world False
! False
<br/> True
This False
is False
a False
test False
. False
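One notebook gotcha: re-running the cell above raises an error because the extension "bad_html" is already registered. Token.set_extension has a force parameter for exactly this case:

from spacy.tokens import Token

# Overwrite an already registered extension instead of raising an error
Token.set_extension("bad_html", default=False, force=True)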

Example: Phone numbers#

https://spacy.io/usage/rule-based-matching#example2

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
           {"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}]
matcher.add("PHONE_NUMBER", [pattern])

doc = nlp("Call me at (123) 456 789 or (123) 456 789!")
print([t.text for t in doc])
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)
['Call', 'me', 'at', '(', '123', ')', '456', '789', 'or', '(', '123', ')', '456', '789', '!']
(123) 456 789
(123) 456 789
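This token pattern depends on the tokenizer splitting the number in exactly this way. A hedged alternative, reusing the regex-over-doc.text approach from above (the regex here is an assumption for the "(ddd) ddd-ddd" shape, not from the spaCy docs):

import re

# Assumed regex for numbers like "(123) 456 789" or "(040) 123-456"
expression = r"\(\d{3}\)\s?\d{3}[-\s]?\d{3}"
for match in re.finditer(expression, doc.text):
    span = doc.char_span(*match.span())
    if span is not None:  # None if the match does not map to token boundaries
        print("Found phone number:", span.text)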

Entity Patterns#

https://spacy.io/usage/rule-based-matching#entityruler-patterns

from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "Apple"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}]
ruler.add_patterns(patterns)

doc = nlp(text)
print([(ent.text, ent.label_) for ent in doc.ents])
[('Apple', 'ORG'), ('San Francisco', 'GPE')]
from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"},
            {"label": "PHONE_DE", "pattern": [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
           {"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}], "id": "phone-de"}
           ]
ruler.add_patterns(patterns)

doc1 = nlp(text)
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])

doc2 = nlp("Apple is opening its first big office in San Fran: call (040) 123-456")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
[('Apple', 'ORG', 'apple'), ('San Francisco', 'GPE', 'san-francisco')]
[('Apple', 'ORG', 'apple'), ('San Fran', 'GPE', 'san-francisco'), ('(040) 123-456', 'PHONE_DE', 'phone-de')]
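Such pattern lists can also live in a JSONL file (one pattern object per line) and be written/read with to_disk/from_disk, see https://spacy.io/usage/rule-based-matching#entityruler-files. A quick sketch:

# Persist the patterns of the current ruler ...
ruler.to_disk("patterns.jsonl")
# ... and load them into a fresh ruler later:
# nlp.add_pipe("entity_ruler", name="entity_ruler_2").from_disk("patterns.jsonl")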
from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("entity_ruler")
entity_ruler_patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
entity_ruler_patterns_2 = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"},
    {"label": "Greeting", "pattern": [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}], "id": "hello"},
    {"label": "Greeting", "pattern": [{"LOWER": "hello"}, {"LOWER": "world"}], "id": "hello"},
    {"label": "Greeting", "pattern": [{"LEMMA": {"IN": ["hello", "welcome"]}},
            {"POS": "NOUN"}], "id": "hello"},
    {"label": "Greeting", "pattern": [{"TEXT": {"REGEX": "deff?in[ia]tely"}}], "id": "definitely"},
    {"label": "Bayerisch", "pattern":  [{"LOWER": "servus"},
           {"IS_PUNCT": True, "OP": "?"}], "id": "hello"},
    {"label": "Bayerisch", "pattern": [{"ORTH": "Servus"}, {"IS_PUNCT": True, "OP": "?"}, {}, {}], "id": "hello"},
]
ruler.add_patterns(entity_ruler_patterns)

# raises an error:
# ruler.add_patterns(entity_ruler_patterns_2)
# ValueError: [E155] The pipeline needs to include a morphologizer or tagger+attribute_ruler 
# in order to use Matcher or PhraseMatcher with the attribute POS. 
# Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` 
# instead of `list(nlp.tokenizer.pipe())`.

doc1 = nlp(text)
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])

doc2 = nlp("Apple is opening its first big office in San Fran.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
[('Apple', 'ORG', 'apple'), ('San Francisco', 'GPE', 'san-francisco')]
[('Apple', 'ORG', 'apple'), ('San Fran', 'GPE', 'san-francisco')]
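A hedged fix for the E155 error above: the POS and LEMMA keys in entity_ruler_patterns_2 require a trained pipeline with tagger and attribute_ruler, so load en_core_web_sm instead of the blank English():

# Sketch: with a trained pipeline, the POS/LEMMA-based patterns can be registered
nlp_full = spacy.load("en_core_web_sm")
ruler_full = nlp_full.add_pipe("entity_ruler", before="ner")
ruler_full.add_patterns(entity_ruler_patterns_2)
doc3 = nlp_full("Welcome students, definately!")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc3.ents])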

These are the patterns we are looking for:

phrase_matcher_patterns = [
    {"label": "Wochentag", 
     "pattern": "Montag", 
     "id": "ex:Montag"},
    {"label": "Wochentag", 
     "pattern": "Dienstag", 
     "id": "ex:Dienstag"},
    {"label": "Wochentag", 
     "pattern": "Mittwoch", 
     "id": "ex:Mittwoch"},
    {"label": "Wochentag", 
     "pattern": "Donnerstag", 
     "id": "ex:Donnerstag"},
    {"label": "Wochentag", 
     "pattern": "Freitag", 
     "id": "ex:Freitag"},
    {"label": "Wochentag", 
     "pattern": "Samstag", 
     "id": "ex:Samstag"},
    {"label": "Wochentag", 
     "pattern": "Sonntag", 
     "id": "ex:Sonntag"}
]

Entity Ruler#

https://spacy.io/usage/rule-based-matching#entityruler-ent-ids

If the cell is re-run, remove the existing entity ruler first:

#nlp.remove_pipe("entity_ruler")

Insert the entity ruler into the pipeline:

ruler = nlp.add_pipe( "entity_ruler", 
    config={"phrase_matcher_attr": "LEMMA"})
ruler.add_patterns(phrase_matcher_patterns)  # patterns siehe oben
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In [22], line 1
----> 1 ruler = nlp.add_pipe( "entity_ruler", 
      2     config={"phrase_matcher_attr": "LEMMA"})
      3 ruler.add_patterns(phrase_matcher_patterns)

File ~/miniconda3/lib/python3.9/site-packages/spacy/language.py:776, in Language.add_pipe(self, factory_name, name, before, after, first, last, source, config, raw_config, validate)
    774 name = name if name is not None else factory_name
    775 if name in self.component_names:
--> 776     raise ValueError(Errors.E007.format(name=name, opts=self.component_names))
    777 # Overriding pipe name in the config is not supported and will be ignored.
    778 if "name" in config:

ValueError: [E007] 'entity_ruler' already exists in pipeline. Existing names: ['entity_ruler']
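A hedged fix for the E007 error: remove the existing entity_ruler first (or register the new one under a different name=). Note that the LEMMA-based PhraseMatcher additionally needs a pipeline with a lemmatizer, e.g. the German pipeline from the start of the notebook:

# Sketch: make the cell re-runnable by removing the old ruler first
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")
ruler = nlp.add_pipe("entity_ruler",
                     config={"phrase_matcher_attr": "LEMMA"})
ruler.add_patterns(phrase_matcher_patterns)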
text2 = """Judo-Minis 5 bis 7 Jahre: Dienstag, 17 bis 18 Uhr, Donnerstag 16.30 bis 17.30 Uhr
Anfänger 8-12 Jahre: Dienstag, 18 bis 19 Uhr, Donnerstag 17.30 bis 18.30 Uhr
Kinder 8 bis 10 Jahre: Montag und Mittwoch, 17 bis 18.30 Uhr
Jugend: Montag und Mittwoch 18.30 bis 20 Uhr
Erwachsene: Montag 20 bis 21.30 Uhr und Freitag 19.30 bis 21 Uhr
Wettkampftraining Jugend: Donnerstag, 18.30 bis 20 Uhr"""

text = """Falls die [glückliche Milch AG](https://www.weideglueck.de/cms?source=(33,17,23)) mehr Milch fettfreier macht, 
hat sie mehr skimmed milk!"""
doc = nlp(text)
[ (token.text, token.pos_, token.lemma_) for token in doc]
html = displacy.render(doc, style="ent", jupyter=True)
# tag_text_with_ents: helper presumably defined elsewhere in the notebook
tag_text_with_ents(doc)

Matcher#

https://spacy.io/usage/rule-based-matching

import spacy
import re

nlp = spacy.load("en_core_web_sm")
text = """The United States of America (USA)
          are commonly known as the United States (U.S. or US) or America.
          And this is a text about Google I/O.
          Barack Obamas hope was the 44th presidentship of the United States"""
doc = nlp(text)

expression = r"[Uu](nited|\.?) ?[Ss](tates|\.?)"
for match in re.finditer(expression, doc.text):
    start, end = match.span()
    span = doc.char_span(start, end)
    # This is a Span object or None if match doesn't map to valid token sequence
    if span is not None:
        print("Found match:", span.text)
# JB

from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Span

matcher = Matcher(nlp.vocab)
def add_weekday_ent(matcher, doc, i, matches):
    # Donnerstag, Donnerstags, an Donnerstagen etc.
    # Append entity to the doc's entity. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="EVENT")
    doc.ents += (entity,)
    print(entity.text)

pattern = [{"LEMMA": "thursday"}, {"LEMMA": "evening"}]
matcher.add("Wochentag", [pattern], on_match=add_weekday_ent)
doc = nlp("This is a text about Google I/O")
matches = matcher(doc)
[ (token.text, token.pos_, token.lemma_) for token in doc]
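Back to the original challenge, a sketch with the German pipeline: matching weekday forms via LEMMA catches both the noun and the adverb (the token list at the top showed 'Dienstags' lemmatized to 'dienstags', so both lemma variants are listed):

nlp_de = spacy.load("de_core_news_sm")
matcher_de = Matcher(nlp_de.vocab)
# both lemma variants, since the adverb keeps its own lowercase lemma
matcher_de.add("Wochentag", [[{"LEMMA": {"IN": ["Donnerstag", "donnerstags"]}}]])

doc_de = nlp_de("Wir treffen uns donnerstags, also an jedem Donnerstag.")
for _, start, end in matcher_de(doc_de):
    print(doc_de[start:end].text)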
from spacy.matcher import Matcher
from spacy.tokens import Span

#nlp = English()
matcher = Matcher(nlp.vocab)

def add_event_ent(matcher, doc, i, matches):
    # Get the current match and create tuple of entity label, start and end.
    # Append entity to the doc's entity. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="EVENT")
    doc.ents += (entity,)
    print(entity.text)

pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
matcher.add("GoogleIO", [pattern], on_match=add_event_ent)
doc = nlp("This is a text about Google I/O")
matches = matcher(doc)