spaCy Entity Ruler, Tag und Zeit
Dieses Notebook: Replikation der Beispiele aus der spaCy-Dokumentation („Rule-based matching“). Ziel: gucken, was läuft; sowie davon ausgehend kleine spielerische Veränderungen.
Challenge: Erkenne Tag und Zeit in DE-Texten, z.B.
Dienstags ab 18 Uhr
Freies Spiel: Freitags 17 bis 19:30 auf dem Sportplatz
Version 2022-05-30
import spacy
from spacy import displacy
# conda install -c conda-forge spacy-model-en_core_web_sm
# EN: nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("de_core_news_sm")
Ein gelöstes Problem - aber benötigen wir die Lösung überhaupt? Nur für den Fall.
Q: “Is there a way to add an regexp-keyed exception, to, say, match phone number?”
A: “No, there’s no way to have regular expressions as tokenizer exceptions. The tokenizer only looks for exceptions as exact string matches, mainly for reasons of speed. The other difficulty for this kind of example is that tokenizer exceptions currently can’t contain spaces. (Support for spaces is planned for a future version of spacy, but not regexes, which would still be too slow.) … I think the best way to do this would be to add a custom pipeline component at the beginning of the pipeline that retokenizes the document with the retokenizer:” source
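Wir erweitern die Pipeline um eine frühe Komponente, die per Regex die vom Tokenizer in einzelne Tokens zerteilten Uhrzeitangaben (z.B. „17.15 Uhr“) wieder zu einzelnen Spans zusammenzieht. (Der Funktionsname markdownLinks_2_spans verweist noch auf Markdown-Links; der reguläre Ausdruck unten erfasst jedoch Uhrzeiten.)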
from spacy.language import Language
import re
def markdownLinks_2_spans(doc):
    """Merge clock-time expressions such as "17.15 Uhr" into single tokens.

    The regular expression is run over the full text of ``doc``. Every match
    that maps onto whole tokens is printed and merged into one token with
    POS ``NOUN`` and entity type ``Uhrzeit``; its lemma is the numeric part
    of the match without the optional "Uhr" suffix.

    Known limitations of the expression: it needs at least two leading
    digits (so "9 Uhr" is not found), and it also matches date-like strings
    such as "25.07".

    The function must be registered as a pipeline component before it can
    be added with ``nlp.add_pipe``.

    Args:
        doc: The spaCy ``Doc`` to retokenize in place.

    Returns:
        The same ``Doc`` with the matched spans merged.
    """
    # Earlier attempts, kept for reference:
    # expression = r"""\s((([01]?\d)|(2[0123]))([\.,:]\d\d)? )(\s*Uhr)?\D"""
    # expression = r"(([012]?\d)|(2[0123]{1}))([\.,:]\d\d)?(\s*Uhr)?"
    expression = r"(([012]?\d\d)([\.,:]\d\d)?)(\s*Uhr)?"

    merges = []
    for match in re.finditer(expression, doc.text):
        start, end = match.span()
        # char_span returns None if the match doesn't map to a valid
        # token sequence; such matches are skipped.
        span = doc.char_span(start, end)
        if span is not None:
            print("Found match:", span.text)
            # The LEMMA value was missing in the original dict (a syntax
            # error). Group 1 is the time without "Uhr", which agrees with
            # the lemmas in the recorded output (e.g. '17.15').
            merges.append((span, match.group(1)))

    # Regex matches never overlap, so all merges fit in one retokenizer
    # context.
    with doc.retokenize() as retokenizer:
        for span, lemma in merges:
            attrs = {"LEMMA": lemma, "POS": "NOUN", "ENT_TYPE": "Uhrzeit"}
            retokenizer.merge(span, attrs=attrs)
    return doc
# nlp.remove_pipe("markdownLinks_2_spans")
# In der Pipeline registrieren
<function __main__.markdownLinks_2_spans(doc)>
text = ("""Wir treffen uns Dienstags
von 17.15 Uhr bis 19 Uhr,
und an Donnerstagen um 18:30 Uhr.
Am 25.07 findet keine Probe statt.""" )
doc = nlp(text)
Found match: 17.15 Uhr
Found match: 19 Uhr
Found match: 18:30 Uhr
Found match: 25.07
[ (token.text, token.pos_, token.lemma_) for token in doc]
[('Wir', 'PRON', 'wir'),
('treffen', 'VERB', 'treffen'),
('uns', 'PRON', 'sich'),
('Dienstags', 'ADV', 'dienstags'),
('\n', 'SPACE', '\n'),
('von', 'ADP', 'von'),
('17.15 Uhr', 'PROPN', '17.15'),
('bis', 'ADP', 'bis'),
('19 Uhr', 'PROPN', '19'),
(',', 'PUNCT', '--'),
('\n', 'SPACE', '\n'),
('und', 'CCONJ', 'und'),
('an', 'ADP', 'an'),
('Donnerstagen', 'NOUN', 'Donnerstag'),
('um', 'ADP', 'um'),
('18:30 Uhr', 'NOUN', '18:30'),
('.', 'PUNCT', '--'),
('\n', 'SPACE', '\n'),
('Am', 'ADP', 'an'),
('25.07', 'NUM', '25.07'),
('findet', 'VERB', 'finden'),
('keine', 'DET', 'kein'),
('Probe', 'NOUN', 'Probe'),
('statt', 'ADV', 'statt'),
('.', 'PUNCT', '--')]
Interessant: 17.15 Uhr wird als PROPN getaggt, aber 18:30 Uhr als NOUN: Da muss noch etwas besser werden.
ABER Einsicht: Der Tokenizer hat nur die Aufgabe, Tokens zu erzeugen. Es ist weder möglich noch sinnvoll, im Tokenizer schon erkennbare und erkannte Named Entities etc. mitzugeben.
Rule-based matching#
“Rule-based systems are a good choice if there’s a more or less finite number of examples that you want to find in the data, or if there’s a very clear, structured pattern you can express with token rules or regular expressions. For instance, country names, IP addresses or URLs are things you might be able to handle well with a purely rule-based approach.”
Token based matching#
text = """Hello, world!
Hello world!
Welcome students, definately!
Servus, Barak Obama; Servus Erika!
Hello Google I/O!
Apple is opening its first big office in San Francisco."""
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern])
#Optionally, we could also choose to add more than one pattern,
#for example to also match sequences without punctuation between “hello” and “world”:
# Alternative token patterns that are all registered under one match ID.
# The closing bracket of this list was missing in the original.
patterns = [
    # "hello", one punctuation token, "world" (case-insensitive)
    [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
    # "hello world" without punctuation in between
    [{"LOWER": "hello"}, {"LOWER": "world"}],
    # a token with lemma "hello" or "welcome", followed by a noun
    [{"LEMMA": {"IN": ["hello", "welcome"]}}, {"POS": "NOUN"}],
    # spelling variants of "definitely"
    [{"TEXT": {"REGEX": "deff?in[ia]tely"}}],
    # "servus" (case-insensitive), optionally followed by punctuation
    [{"LOWER": "servus"}, {"IS_PUNCT": True, "OP": "?"}],
    # exact "Servus", optional punctuation, then any two tokens
    [{"ORTH": "Servus"}, {"IS_PUNCT": True, "OP": "?"}, {}, {}],
]
matcher.add("HelloWorld", patterns)
# Run the pipeline over the sample text and report every match: hash ID,
# readable ID, token offsets, matched text and its lemmatized form.
doc = nlp(text)
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]  # the matched token span
    string_id = nlp.vocab.strings[match_id]  # hash -> string representation
    print(f"{match_id} {string_id} {start} {end} {span.text} | {span.lemma_}")
15578876784678163569 HelloWorld 0 3 Hello, world | hello, world
15578876784678163569 HelloWorld 5 7 Hello world | Hello world
15578876784678163569 HelloWorld 9 11 Welcome students | welcome student
15578876784678163569 HelloWorld 12 13 definately | definately
15578876784678163569 HelloWorld 15 16 Servus | Servus
15578876784678163569 HelloWorld 15 17 Servus, | Servus,
15578876784678163569 HelloWorld 15 18 Servus, Barak | Servus, Barak
15578876784678163569 HelloWorld 15 19 Servus, Barak Obama | Servus, Barak Obama
15578876784678163569 HelloWorld 20 21 Servus | Servus
15578876784678163569 HelloWorld 20 23 Servus Erika! | Servus Erika!
Matching regular expressions on the full text#
import re
nlp = spacy.load("en_core_web_sm")
# Search the raw document text with a regular expression and map every hit
# back onto token boundaries.
doc = nlp("The United States of America (USA) are commonly known as the United States (U.S. or US) or America.")
expression = r"[Uu](nited|\.?) ?[Ss](tates|\.?)"
for match in re.finditer(expression, doc.text):
    start, end = match.start(), match.end()
    span = doc.char_span(start, end)
    # char_span yields None when the match doesn't map to a valid token
    # sequence; those hits are ignored.
    if span is None:
        continue
    print("Found match:", span.text)
Found match: United States
Found match: United States
Found match: U.S.
Found match: US
Adding on_match rules#
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Span
nlp = English()
matcher = Matcher(nlp.vocab)
def add_event_ent(matcher, doc, i, matches):
    """Matcher ``on_match`` callback: record match ``i`` as an EVENT entity.

    Args:
        matcher: The matcher that produced the matches (not used here).
        doc: The ``Doc`` the matches were found in; its ``ents`` are updated.
        i: Index of the current match in ``matches``.
        matches: List of ``(match_id, start, end)`` tuples.
    """
    from spacy.util import filter_spans

    # Get the current match and build the entity span from its token offsets.
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="EVENT")
    # Extend doc.ents instead of overwriting it. Assigning overlapping spans
    # to doc.ents raises, so overlaps are resolved first. The new entity is
    # listed first so that it wins against an existing one of equal length.
    doc.ents = filter_spans((entity,) + tuple(doc.ents))
    # The recorded notebook output ("Google I/O") shows the matched text was
    # printed here; the statement was missing.
    print(entity.text)
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
matcher.add("GoogleIO", [pattern], on_match=add_event_ent)
doc = nlp(text)
matches = matcher(doc)
Google I/O
from spacy import displacy
html = displacy.render(doc, style="ent", page=True,
options={"ents": ["EVENT"]})
Creating spans from matches#
# Two ways of turning matcher results into labelled Span objects.
nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
person_pattern = [{"lower": "barack"}, {"lower": "obama"}]
matcher.add("PERSON", [person_pattern])
doc = nlp("Barack Obama was the 44th president of the United States")

# 1. The matcher returns (match_id, start, end) tuples; build each Span by
#    hand and reuse the match_id as its label.
matches = matcher(doc)
for match_id, start, end in matches:
    span = Span(doc, start, end, label=match_id)
    print(span.text, span.label_)

# 2. The matcher returns ready-made Span objects.
matches = matcher(doc, as_spans=True)
for span in matches:
    print(span.text, span.label_)
Barack Obama PERSON
Barack Obama PERSON
Using custom pipeline components#
import spacy
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Token
# We're using a component factory because the component needs to be
# initialized with the shared vocab via the nlp object.
# The factory must be registered under the name "html_merger", otherwise
# nlp.add_pipe("html_merger") cannot find it.
@Language.factory("html_merger")
def create_bad_html_merger(nlp, name):
    """Build the ``BadHTMLMerger`` component for a pipeline.

    Args:
        nlp: The ``Language`` object whose shared vocab the component uses.
        name: Name of the component instance in the pipeline (not used here).

    Returns:
        A ``BadHTMLMerger`` bound to ``nlp.vocab``.
    """
    return BadHTMLMerger(nlp.vocab)
class BadHTMLMerger:
    """Pipeline component that merges ``<br>`` / ``<br/>`` into single tokens.

    Each merged token is flagged through the custom token attribute
    ``token._.bad_html``.
    """

    def __init__(self, vocab):
        """Set up the matcher and the ``bad_html`` token extension.

        Args:
            vocab: The shared vocab the matcher operates on.
        """
        # The closing bracket of this list was missing in the original.
        patterns = [
            [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
            [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
        ]
        # Register a new token extension to flag bad HTML. Registering it a
        # second time raises, so only do it when it does not exist yet
        # (e.g. when the component is created more than once per session).
        if not Token.has_extension("bad_html"):
            Token.set_extension("bad_html", default=False)
        self.matcher = Matcher(vocab)
        self.matcher.add("BAD_HTML", patterns)

    def __call__(self, doc):
        """Merge and flag all bad-HTML matches in ``doc``.

        This method is invoked when the component is called on a Doc.

        Args:
            doc: The ``Doc`` to process; it is modified in place.

        Returns:
            The same ``Doc`` after retokenization.
        """
        matches = self.matcher(doc)
        # Collect the matched spans here. The original never filled this
        # list, so nothing was ever merged or flagged.
        spans = [doc[start:end] for match_id, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
                for token in span:
                    token._.bad_html = True  # Mark token as bad HTML
        return doc
# Load the English pipeline, append the merger as its last component and
# show for every token whether it was flagged as bad HTML.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("html_merger", last=True)
doc = nlp("Hello<br>world! <br/> This is a test.")
for token in doc:
    print(f"{token.text} {token._.bad_html}")
Hello False
<br> True
world False
! False
<br/> True
This False
is False
a False
test False
. False
Example: Phone numbers#
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Token pattern for numbers written like "(123) 456 789" or "(123) 456-789":
# a bracketed three-digit group, then two three-digit groups with an
# optional hyphen in between.
pattern = [
    {"ORTH": "("},
    {"SHAPE": "ddd"},
    {"ORTH": ")"},
    {"SHAPE": "ddd"},
    {"ORTH": "-", "OP": "?"},
    {"SHAPE": "ddd"},
]
matcher.add("PHONE_NUMBER", [pattern])
doc = nlp("Call me at (123) 456 789 or (123) 456 789!")
print([t.text for t in doc])
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    # The recorded output lists the matched numbers, but the loop computed
    # the span without printing it.
    print(span.text)
['Call', 'me', 'at', '(', '123', ')', '456', '789', 'or', '(', '123', ')', '456', '789', '!']
(123) 456 789
(123) 456 789
Entity Patterns#
from spacy.lang.en import English
nlp = English()
ruler = nlp.add_pipe("entity_ruler")
# One phrase pattern (exact string) and one token pattern (list of dicts).
patterns = [
    {"label": "ORG", "pattern": "Apple"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]},
]
# The patterns have to be handed to the ruler; without this call doc.ents
# stays empty, which contradicts the recorded output.
ruler.add_patterns(patterns)
doc = nlp(text)
print([(ent.text, ent.label_) for ent in doc.ents])
[('Apple', 'ORG'), ('San Francisco', 'GPE')]
from spacy.lang.en import English
nlp = English()
ruler = nlp.add_pipe("entity_ruler")
# Entity patterns with ids: several patterns can share one id, so that
# "San Francisco" and "San Fran" resolve to the same entity id.
# The closing bracket of this list was missing in the original.
patterns = [
    {"label": "ORG", "pattern": "Apple", "id": "apple"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"},
    {
        "label": "PHONE_DE",
        "pattern": [
            {"ORTH": "("},
            {"SHAPE": "ddd"},
            {"ORTH": ")"},
            {"SHAPE": "ddd"},
            {"ORTH": "-", "OP": "?"},
            {"SHAPE": "ddd"},
        ],
        "id": "phone-de",
    },
]
# Register the patterns with the ruler; the recorded output shows matched
# entities, so this call must have been present.
ruler.add_patterns(patterns)
doc1 = nlp(text)
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
doc2 = nlp("Apple is opening its first big office in San Fran: call (040) 123-456")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
[('Apple', 'ORG', 'apple'), ('San Francisco', 'GPE', 'san-francisco')]
[('Apple', 'ORG', 'apple'), ('San Fran', 'GPE', 'san-francisco'), ('(040) 123-456', 'PHONE_DE', 'phone-de')]
from spacy.lang.en import English
nlp = English()
ruler = nlp.add_pipe("entity_ruler")
entity_ruler_patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
{"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
# Extended pattern list: the organisation/place patterns plus the greeting
# patterns from the token-matcher example. Two of the patterns rely on the
# LEMMA and POS token attributes.
# The closing bracket of this list was missing in the original.
entity_ruler_patterns_2 = [
    {"label": "ORG", "pattern": "Apple", "id": "apple"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"},
    {"label": "Greeting", "pattern": [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}], "id": "hello"},
    {"label": "Greeting", "pattern": [{"LOWER": "hello"}, {"LOWER": "world"}], "id": "hello"},
    {"label": "Greeting", "pattern": [{"LEMMA": {"IN": ["hello", "welcome"]}}, {"POS": "NOUN"}], "id": "hello"},
    {"label": "Greeting", "pattern": [{"TEXT": {"REGEX": "deff?in[ia]tely"}}], "id": "definitely"},
    {"label": "Bayerisch", "pattern": [{"LOWER": "servus"}, {"IS_PUNCT": True, "OP": "?"}], "id": "hello"},
    {"label": "Bayerisch", "pattern": [{"ORTH": "Servus"}, {"IS_PUNCT": True, "OP": "?"}, {}, {}], "id": "hello"},
]
# wirft einen Fehler:
# ruler.add_patterns(entity_ruler_patterns_2)
# ValueError: [E155] The pipeline needs to include a morphologizer or tagger+attribute_ruler
# in order to use Matcher or PhraseMatcher with the attribute POS.
# Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())`
# instead of `list(nlp.tokenizer.pipe())`.
# Register the first pattern list only. The extended list cannot be used
# with this blank English pipeline because it needs the POS attribute
# (error E155 quoted above). Without this call the ruler has no patterns
# and the recorded entities could not have been found.
ruler.add_patterns(entity_ruler_patterns)
doc1 = nlp(text)
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
doc2 = nlp("Apple is opening its first big office in San Fran.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
[('Apple', 'ORG', 'apple'), ('San Francisco', 'GPE', 'san-francisco')]
[('Apple', 'ORG', 'apple'), ('San Fran', 'GPE', 'san-francisco')]
Das sind die Patterns, die wir suchen:
# German weekday names; each becomes one phrase pattern with the label
# "Wochentag" and an id of the form "ex:<weekday>".
wochentage = (
    "Montag",
    "Dienstag",
    "Mittwoch",
    "Donnerstag",
    "Freitag",
    "Samstag",
    "Sonntag",
)
# The original hand-written list was never closed (missing "]"). Building
# it from the names yields the same seven entries.
phrase_matcher_patterns = [
    {"label": "Wochentag", "pattern": tag, "id": f"ex:{tag}"}
    for tag in wochentage
]
Entity Ruler#
Den Entity Ruler in die Pipeline einfügen:
# Add an entity ruler whose phrase patterns are compared on the LEMMA
# attribute, then load the weekday patterns into it.
ruler_config = {"phrase_matcher_attr": "LEMMA"}
ruler = nlp.add_pipe("entity_ruler", config=ruler_config)
ruler.add_patterns(phrase_matcher_patterns)
text2 = """Judo-Minis 5 bis 7 Jahre: Dienstag, 17 bis 18 Uhr, Donnerstag 16.30 bis 17.30 Uhr
Anfänger 8-12 Jahre: Dienstag, 18 bis 19 Uhr, Donnerstag 17.30 bis 18.30 Uhr
Kinder 8 bis 10 Jahre: Montag und Mittwoch, 17 bis 18.30 Uhr
Jugend: Montag und Mittwoch 18.30 bis 20 Uhr
Erwachsene: Montag 20 bis 21.30 Uhr und Freitag 19.30 bis 21 Uhr
Wettkampftraining Jugend: Donnerstag, 18.30 bis 20 Uhr"""
text = """Falls die [glückliche Milch AG](,17,23)) mehr Milch fettfreier macht,
hat sie mehr skimmed milk!"""
doc = nlp(text)
[ (token.text, token.pos_, token.lemma_) for token in doc]
html = displacy.render(doc, style="ent", jupyter=True)
import spacy
import re
nlp = spacy.load("en_core_web_sm")
# Sample text with several spellings of "United States".
text = """The United States of America (USA)
are commonly known as the United States (U.S. or US) or America.
And this is a text about Google I/O.
Barack Obamas hope was the 44th presidentship of the United States"""
doc = nlp(text)
expression = r"[Uu](nited|\.?) ?[Ss](tates|\.?)"
# Search the raw text and map each hit back onto token boundaries.
for match in re.finditer(expression, doc.text):
    start, end = match.start(), match.end()
    span = doc.char_span(start, end)
    # char_span yields None when the match doesn't map to a valid token
    # sequence; those hits are ignored.
    if span is None:
        continue
    print("Found match:", span.text)
# JB
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Span
matcher = Matcher(nlp.vocab)
def add_weekday_ent(matcher, doc, i, matches):
    """Matcher ``on_match`` callback: record match ``i`` as an entity.

    Intended for weekday expressions (Donnerstag, Donnerstags,
    an Donnerstagen etc.). The entity label used is "EVENT".

    Args:
        matcher: The matcher that produced the matches (not used here).
        doc: The ``Doc`` the matches were found in; its ``ents`` are updated.
        i: Index of the current match in ``matches``.
        matches: List of ``(match_id, start, end)`` tuples.
    """
    from spacy.util import filter_spans

    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="EVENT")
    # Extend doc.ents instead of overwriting it. The pipeline may already
    # have set an entity on these tokens, and assigning overlapping spans to
    # doc.ents raises. Overlaps are therefore resolved first; the new entity
    # is listed first so that it wins against an existing one of equal
    # length.
    doc.ents = filter_spans((entity,) + tuple(doc.ents))
pattern = [{"LEMMA": "thursday"}, {"LEMMA": "evening"}]
matcher.add("Wochentag", [pattern], on_match=add_weekday_ent)
doc = nlp("This is a text about Google I/O")
matches = matcher(doc)
[ (token.text, token.pos_, token.lemma_) for token in doc]
from spacy.matcher import Matcher
from spacy.tokens import Span
#nlp = English()
matcher = Matcher(nlp.vocab)
def add_event_ent(matcher, doc, i, matches):
    """Matcher ``on_match`` callback: record match ``i`` as an EVENT entity.

    Args:
        matcher: The matcher that produced the matches (not used here).
        doc: The ``Doc`` the matches were found in; its ``ents`` are updated.
        i: Index of the current match in ``matches``.
        matches: List of ``(match_id, start, end)`` tuples.
    """
    from spacy.util import filter_spans

    # Get the current match and build the entity span from its token offsets.
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="EVENT")
    # Extend doc.ents instead of overwriting it. With a trained pipeline the
    # matched tokens may already belong to an entity, and assigning
    # overlapping spans to doc.ents raises. Overlaps are therefore resolved
    # first; the new entity is listed first so that it wins against an
    # existing one of equal length.
    doc.ents = filter_spans((entity,) + tuple(doc.ents))
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
matcher.add("GoogleIO", [pattern], on_match=add_event_ent)
doc = nlp("This is a text about Google I/O")
matches = matcher(doc)