Install
Interactive installer
1
2
| import nltk
# Open NLTK's interactive installer to choose which data packages to download.
nltk.download()
|
Tag |
Desc |
CC |
Coordinating conjunction |
CD |
Cardinal number |
DT |
Determiner |
IN |
Preposition or subordinating conjunction |
JJ |
Adjective |
NN |
Noun, singular or mass |
NNS |
Noun, plural |
POS |
Possessive ending |
PRP |
Personal pronoun |
RB |
Adverb |
VB |
Verb, base form |
WP |
Wh-pronoun |
TO |
to |
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
https://www.nltk.org/
Hello World
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
| import nltk
from collections import Counter
# Read the whole book into memory; the path is relative to the working directory.
with open('Harry Potter and the Sorcerer\'s Stone.txt') as f:
    hp = f.read()

# Split the raw text into word tokens, then pair each token with its
# Penn Treebank part-of-speech tag: tagged is a list of (token, tag) tuples.
tokens = nltk.word_tokenize(hp)
tagged = nltk.pos_tag(tokens)

# Keep only proper nouns (NNP = proper noun, singular; NNPS = proper noun, plural).
nnps = [t for t in tagged if t[1] in ('NNP', 'NNPS')]

# Counter tallies each (token, tag) pair directly; no manual increment loop needed.
c = Counter(nnps)

# The 20 most frequent proper nouns; as a bare expression this is only
# displayed when run in a REPL or notebook.
c.most_common(20)
|
- tokenizing - word tokenizers, sentence tokenizers
- corpora (singular: corpus) - bodies of text, e.g. medical journals, presidential speeches
- lexicon - words and their meanings
Stop Words
1
2
3
4
| from nltk.corpus import stopwords
# NLTK's English stop-word list is lowercase; a set makes each membership test O(1).
stop_words = set(stopwords.words('english'))
# Compare case-insensitively so capitalized stop words ("The", "And") are removed too;
# the tokens that survive keep their original casing.
words = [w for w in tokens if w.lower() not in stop_words]
|
Stemming
1
2
3
4
| from nltk.stem import PorterStemmer
# A single stemmer instance is reused for every token.
ps = PorterStemmer()
# Reduce each token to its Porter stem, preserving the original token order.
words = [ps.stem(w) for w in tokens]
|
Tagging