Install spaCy and one of its English models (a GPU build is also available):
pip install spacy
python -m spacy download en_core_web_{sm,md,lg} # small, medium or large
python -m spacy validate
import spacy

# Load the small English pipeline downloaded above
nlp = spacy.load('en_core_web_sm')
# Calling the pipeline on a text returns a Doc of tokens
doc = nlp('Larry Page founded Google')

# One token text per line: Larry\nPage\n...
for tok in doc:
    print(tok.text)
Other token attributes:
token.i
token.is_alpha
token.is_punct
token.like_num
token.pos (integer id of the part-of-speech tag)
token.pos_ (part-of-speech tag name, e.g. 'PROPN')
token.ent_type_ (named-entity type name, e.g. 'PERSON')
from spacy import displacy

# Visualize the named entities of the current doc (highlighted spans)
displacy.render(doc, style="ent")

# Dependency relation label of a token (e.g. 'nsubj')
token.dep_

# Visualize the dependency parse of a fresh sentence
doc = nlp("This is a sentence")
displacy.render(doc, style="dep")
from spacy.matcher import Matcher

# Rule-based token matcher over the pipeline's shared vocab
matcher = Matcher(nlp.vocab)

# Match the two-token sequence "new york", case-insensitively
pattern = [{"LOWER": "new"}, {"LOWER": "york"}]
# spaCy v3 API: patterns are passed as a list of patterns.
# The old v2 signature add(key, callback, *patterns) was removed in v3
# (this now matches the PhraseMatcher/DependencyMatcher calls below).
matcher.add('CITIES', [pattern])

# Returns (match_id, start, end) triples of token offsets into doc
matches = matcher(doc)
for match_id, start, end in matches:
    # Get the matched span by slicing the Doc
    span = doc[start:end]
    print(span.text)
from spacy.matcher import PhraseMatcher

# Reload the small English pipeline
nlp = spacy.load("en_core_web_sm")
# PhraseMatcher matches exact Doc phrases rather than token patterns
matcher = PhraseMatcher(nlp.vocab)

# The phrases to look for, verbatim
phrases = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# nlp.make_doc only tokenizes (skips the full pipeline) to speed things up
patterns = list(map(nlp.make_doc, phrases))
matcher.add("TerminologyList", patterns)
from spacy.matcher import DependencyMatcher

# Dependency-tree pattern for "[subject] ... initially founded".
# Each dict adds one token, linked to an already-added token via
# LEFT_ID/REL_OP (Semgrex-style operators; see spaCy DependencyMatcher docs).
pattern = [
# anchor token: the exact word "founded"
{
"RIGHT_ID": "founded",
"RIGHT_ATTRS": {"ORTH": "founded"}
},
# founded -> subject: ">" = a direct dependent of "founded",
# here one carrying the nsubj dependency label
{
"LEFT_ID": "founded",
"REL_OP": ">",
"RIGHT_ID": "subject",
"RIGHT_ATTRS": {"DEP": "nsubj"}
},
# "founded" follows "initially": ";" relates "founded" to the token
# it immediately follows in the sentence
{
"LEFT_ID": "founded",
"REL_OP": ";",
"RIGHT_ID": "initially",
"RIGHT_ATTRS": {"ORTH": "initially"}
}
]
matcher = DependencyMatcher(nlp.vocab)
# v3 API: a list of patterns per key
matcher.add("FOUNDED", [pattern])
# Each match is (match_id, [token indices, one per pattern entry])
matches = matcher(doc)
Similarity relies on word vectors, which are only available in the medium or large models.
# Similarity score between two docs, computed from their word vectors
# (only meaningful with md/lg models — the sm model ships no real vectors)
doc1.similarity(doc2)
# Human-readable description of a tag/label name
spacy.explain('NNP')