NLTK (Natural Language Toolkit) tokenization and tagging
NLTK provides support for a wide variety of text-processing tasks. In this section, we'll tokenize text and tag it with parts of speech.
We're going to use Chapter 3 of Steinbeck's The Pearl as input.
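If this is a fresh NLTK install, note that word_tokenize() and pos_tag() rely on data packages that are downloaded separately; the package names below ('punkt' and 'averaged_perceptron_tagger') are the ones used by recent NLTK releases and may differ in older versions. A quick sanity check in the REPL (the sample sentence is our own, and the exact tags may vary slightly by tagger version):

>>> import nltk
>>> nltk.download('punkt')                       # tokenizer models
>>> nltk.download('averaged_perceptron_tagger')  # POS tagger model
>>> nltk.pos_tag(nltk.word_tokenize("Kino found a pearl."))
[('Kino', 'NNP'), ('found', 'VBD'), ('a', 'DT'), ('pearl', 'NN'), ('.', '.')]

With those in place, here is the full script: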
import nltk
from collections import Counter

def get_tokens():
    with open('/home/k/TEST/NLTK/Pearl3.txt') as pearl:
        tokens = nltk.word_tokenize(pearl.read())
    return tokens

if __name__ == "__main__":
    tokens = get_tokens()
    print("tokens[:20]=%s" % tokens[:20])
    count = Counter(tokens)
    print("len(count) = %s" % len(count))
    print("most_common = %s" % count.most_common(10))
    tagged = nltk.pos_tag(tokens)
    print("tagged[:20]=%s" % tagged[:20])
Output:
tokens[:20]=['Chapter', '3', 'A', 'town', 'is', 'a', 'thing', 'like', 'a', 'colonial', 'animal.', 'A', 'town', 'has', 'a', 'nervous', 'system', 'and', 'a', 'head']
len(count) = 1459
most_common = [('the', 429), ('and', 314), (',', 313), ('a', 145), ('of', 138), ('he', 122), ('in', 115), ('his', 110), ('to', 104), ('Kino', 98)]
tagged[:20]=[('Chapter', 'NN'), ('3', 'CD'), ('A', 'DT'), ('town', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('thing', 'NN'), ('like', 'IN'), ('a', 'DT'), ('colonial', 'JJ'), ('animal.', 'NNP'), ('A', 'DT'), ('town', 'NN'), ('has', 'VBZ'), ('a', 'DT'), ('nervous', 'JJ'), ('system', 'NN'), ('and', 'CC'), ('a', 'DT'), ('head', 'NN')]
As we see from the output for most_common, the comma alone accounts for (',', 313). So, if we don't want to process any punctuation, we can delete it with str.translate() before tokenizing; in Python 3 the deletion table comes from str.maketrans (in Python 2 this was the translate(None, string.punctuation) form):
import nltk
from collections import Counter
import string

def get_tokens():
    with open('/home/k/TEST/NLTK/Pearl3.txt') as pearl:
        # strip all punctuation characters before tokenizing
        text = pearl.read().translate(str.maketrans('', '', string.punctuation))
        tokens = nltk.word_tokenize(text)
    return tokens

if __name__ == "__main__":
    tokens = get_tokens()
    print("tokens[:20]=%s" % tokens[:20])
    count = Counter(tokens)
    print("len(count) = %s" % len(count))
    print("most_common = %s" % count.most_common(10))
    tagged = nltk.pos_tag(tokens)
    print("tagged[:20]=%s" % tagged[:20])
Then we get output with the punctuation removed:
tokens[:20]=['Chapter', '3', 'A', 'town', 'is', 'a', 'thing', 'like', 'a', 'colonial', 'animal', 'A', 'town', 'has', 'a', 'nervous', 'system', 'and', 'a', 'head']
len(count) = 1299
most_common = [('the', 429), ('and', 315), ('a', 145), ('of', 138), ('he', 122), ('in', 120), ('his', 110), ('to', 104), ('it', 89), ('was', 84)]
tagged[:20]=[('Chapter', 'NN'), ('3', 'CD'), ('A', 'DT'), ('town', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('thing', 'NN'), ('like', 'IN'), ('a', 'DT'), ('colonial', 'JJ'), ('animal', 'JJ'), ('A', 'DT'), ('town', 'NN'), ('has', 'VBZ'), ('a', 'DT'), ('nervous', 'JJ'), ('system', 'NN'), ('and', 'CC'), ('a', 'DT'), ('head', 'NN')]
Now the vocabulary size has been reduced from len(count) = 1459 to len(count) = 1299.
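To see what the translate() call is doing in isolation, here is a minimal sketch (the sample string is our own):

>>> import string
>>> table = str.maketrans('', '', string.punctuation)
>>> 'a colonial animal.'.translate(table)
'a colonial animal'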
Sometimes, common words that appear to be of little value in selecting matching documents are excluded from the vocabulary entirely. These words are called stop words. The general strategy for determining a stop list is to sort the terms by frequency and label the most frequent terms as stop words; they are then discarded during indexing.
Examples: a, an, the, he, by, it, are, etc.
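That frequency-based strategy is easy to sketch with the Counter we already have; build_stop_list is our own hypothetical helper, and the cutoff of 10 terms is an arbitrary choice for illustration:

from collections import Counter

def build_stop_list(tokens, n=10):
    # label the n most frequent terms as stop words (n is an arbitrary cutoff)
    return set(term for term, freq in Counter(tokens).most_common(n))

# With the tokens from the first script, this yields exactly the
# most_common terms printed above: the, and, ',', a, of, he, in, his, to, Kino.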
Before using the stopwords corpus, we need to download it from NLTK; otherwise we may get an error like this:
LookupError:
**********************************************************************
Resource 'corpora/stopwords' not found. Please use the NLTK Downloader to obtain the resource:
>>> nltk.download()
We can follow the interactive steps after issuing nltk.download(), or download just the stopword list directly:
>>> import nltk
>>> nltk.download('stopwords')
Now, we can use the list:
>>> from nltk.corpus import stopwords
>>> stopwords.words('english')
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']
Let's incorporate the stopwords into our previous code:
import nltk
from collections import Counter
import string
from nltk.corpus import stopwords

def get_tokens():
    with open('/home/k/TEST/NLTK/Pearl3.txt') as pearl:
        text = pearl.read().translate(str.maketrans('', '', string.punctuation))
        tokens = nltk.word_tokenize(text)
    return tokens

if __name__ == "__main__":
    tokens = get_tokens()
    print("tokens[:20]=%s" % tokens[:20])
    count = Counter(tokens)
    print("before: len(count) = %s" % len(count))
    # drop every token that appears in the English stopword list
    filtered = [w for w in tokens if w not in stopwords.words('english')]
    print("filtered tokens[:20]=%s" % filtered[:20])
    count = Counter(filtered)
    print("after: len(count) = %s" % len(count))
    print("most_common = %s" % count.most_common(10))
    tagged = nltk.pos_tag(filtered)
    print("tagged[:20]=%s" % tagged[:20])
Output:
tokens[:20]=['Chapter', '3', 'A', 'town', 'is', 'a', 'thing', 'like', 'a', 'colonial', 'animal', 'A', 'town', 'has', 'a', 'nervous', 'system', 'and', 'a', 'head']
before: len(count) = 1299
filtered tokens[:20]=['Chapter', '3', 'A', 'town', 'thing', 'like', 'colonial', 'animal', 'A', 'town', 'nervous', 'system', 'head', 'shoulders', 'feet', 'A', 'town', 'thing', 'separate', 'towns']
after: len(count) = 1193
most_common = [('Kino', 77), ('And', 58), ('The', 48), ('He', 48), ('pearl', 46), ('little', 45), ('said', 38), ('could', 32), ('Juana', 31), ('Kinos', 29)]
tagged[:20]=[('Chapter', 'NN'), ('3', 'CD'), ('A', 'DT'), ('town', 'NN'), ('thing', 'NN'), ('like', 'IN'), ('colonial', 'JJ'), ('animal', 'JJ'), ('A', 'DT'), ('town', 'NN'), ('nervous', 'JJ'), ('system', 'NN'), ('head', 'NN'), ('shoulders', 'NNS'), ('feet', 'VBP'), ('A', 'DT'), ('town', 'NN'), ('thing', 'NN'), ('separate', 'JJ'), ('towns', 'NNS')]
Notice that the count has decreased: 1299 => 1193!
Even though we removed the majority of the stopwords, some tokens that belong to the stopword list, such as 'A', still survive: the list is all lowercase, so capitalized tokens never match. So we need to lowercase all the characters of the text with lower() before we start processing.
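We can verify this directly in the REPL:

>>> from nltk.corpus import stopwords
>>> 'a' in stopwords.words('english')
True
>>> 'A' in stopwords.words('english')
False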
Here is the final code for this chapter:
import nltk
from collections import Counter
import string
from nltk.corpus import stopwords

def get_tokens():
    with open('/home/k/TEST/NLTK/Pearl3.txt') as pearl:
        # lowercase first, then strip punctuation
        text = pearl.read().lower().translate(str.maketrans('', '', string.punctuation))
        tokens = nltk.word_tokenize(text)
    return tokens

if __name__ == "__main__":
    tokens = get_tokens()
    print("tokens[:20]=%s" % tokens[:20])
    count = Counter(tokens)
    print("before: len(count) = %s" % len(count))
    filtered = [w for w in tokens if w not in stopwords.words('english')]
    print("filtered tokens[:20]=%s" % filtered[:20])
    count = Counter(filtered)
    print("after: len(count) = %s" % len(count))
    print("most_common = %s" % count.most_common(10))
    tagged = nltk.pos_tag(filtered)
    print("tagged[:20]=%s" % tagged[:20])
Output:
tokens[:20]=['chapter', '3', 'a', 'town', 'is', 'a', 'thing', 'like', 'a', 'colonial', 'animal', 'a', 'town', 'has', 'a', 'nervous', 'system', 'and', 'a', 'head']
before: len(count) = 1238
filtered tokens[:20]=['chapter', '3', 'town', 'thing', 'like', 'colonial', 'animal', 'town', 'nervous', 'system', 'head', 'shoulders', 'feet', 'town', 'thing', 'separate', 'towns', 'two', 'towns', 'alike']
after: len(count) = 1131
most_common = [('kino', 77), ('pearl', 50), ('little', 45), ('said', 38), ('could', 32), ('juana', 31), ('kinos', 29), ('doctor', 28), ('eyes', 26), ('came', 26)]
tagged[:20]=[('chapter', 'NN'), ('3', 'CD'), ('town', 'NN'), ('thing', 'NN'), ('like', 'IN'), ('colonial', 'JJ'), ('animal', 'JJ'), ('town', 'NN'), ('nervous', 'JJ'), ('system', 'NN'), ('head', 'NN'), ('shoulders', 'NNS'), ('feet', 'VBP'), ('town', 'VBN'), ('thing', 'NN'), ('separate', 'JJ'), ('towns', 'NNS'), ('two', 'CD'), ('towns', 'NNS'), ('alike', 'IN')]
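One last note on the filtering step: the list comprehension above evaluates stopwords.words('english') on every iteration and then does a linear scan of the returned list, so it is slow on larger texts. Building a set once makes each lookup constant-time (a small optimization, not needed for this short chapter):

# build the stopword set once, then test membership against it
stop_words = set(stopwords.words('english'))
filtered = [w for w in tokens if w not in stop_words]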