import pandas as pd
import numpy as np
# Load the movie metadata and pull titles/synopses out as parallel lists.
movie_data = pd.read_csv('D:/chapter5/movie_data.csv')
movie_titles = movie_data['Title'].tolist()
movie_synopses = movie_data['Synopsis'].tolist()
# Sanity-check the first record (synopsis truncated to 1000 characters).
first_title, first_synopsis = movie_titles[0], movie_synopses[0]
print('Movie:', first_title)
print('Movie Synopsis:', first_synopsis[:1000])
import nltk
import re
import string
# English stopword list from NLTK (requires the 'stopwords' corpus to be
# downloaded, e.g. via nltk.download('stopwords')).
stopword_list=nltk.corpus.stopwords.words('english')
def tokenize_text(text):
    """Split *text* into NLTK word tokens, stripped of surrounding whitespace."""
    return [token.strip() for token in nltk.word_tokenize(text)]
def remove_stopwords(text):
    """Drop English stopwords from *text* and rejoin the surviving tokens."""
    kept = [tok for tok in tokenize_text(text) if tok not in stopword_list]
    return ' '.join(kept)
def remove_special_characters(text):
    """Delete punctuation characters from every token and rejoin the text.

    Tokens that become empty after punctuation removal are discarded.
    """
    # str.translate deletes all punctuation in one C-level pass per token,
    # equivalent to re.sub('[<punct>]', '', token) in the original.
    delete_punct = str.maketrans('', '', string.punctuation)
    cleaned = (tok.translate(delete_punct) for tok in tokenize_text(text))
    return ' '.join(tok for tok in cleaned if tok)
def _lemmatize_text(text):
    """Lemmatize each token of *text* with WordNet and rejoin with spaces.

    Requires the NLTK 'wordnet' corpus to be downloaded.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokenize_text(text))

def text_normalize(corpus, tokenize=False, lemmatize=True, stopword=True):
    """Normalize a corpus of raw documents.

    Each document is lower-cased, optionally lemmatized, stripped of
    punctuation, optionally stripped of stopwords, and optionally tokenized.

    Parameters:
        corpus: iterable of raw text documents.
        tokenize: if True, each normalized document is returned as a token
            list instead of a string.
        lemmatize: if True (default), apply WordNet lemmatization.
        stopword: if True (default), remove English stopwords.

    Returns:
        list of normalized documents (strings, or token lists if tokenize).
    """
    # BUG FIX: the original called an undefined `lemmatize_text`, so the
    # default lemmatize=True path raised NameError; _lemmatize_text above
    # supplies the missing implementation.
    normalized_corpus = []
    for text in corpus:
        text = text.lower()
        if lemmatize:
            text = _lemmatize_text(text)
        text = remove_special_characters(text)
        if stopword:
            text = remove_stopwords(text)
        if tokenize:
            text = tokenize_text(text)
        normalized_corpus.append(text)
    return normalized_corpus
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def build_feature_matrix(documents, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.0, max_df=1.0):
    """Vectorize *documents* into a sparse feature matrix.

    Parameters:
        documents: iterable of text documents.
        feature_type: 'binary' (presence/absence counts), 'frequency'
            (raw term counts), or 'tfidf' (tf-idf weights).
        ngram_range: (min_n, max_n) n-gram range passed to the vectorizer.
        min_df, max_df: document-frequency cutoffs passed to the vectorizer.

    Returns:
        (vectorizer, feature_matrix) — the fitted vectorizer and a sparse
        matrix of floats with one row per document.

    Raises:
        ValueError: if *feature_type* is not one of the supported values.
    """
    feature_type = feature_type.lower().strip()
    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df,
                                     ngram_range=ngram_range)
    else:
        # ValueError is the idiomatic exception for a bad argument value;
        # it is still an Exception subclass, so broad handlers keep working.
        raise ValueError("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")
    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix
# Normalize the synopses (lemmatization disabled) and vectorize them.
norm_movie_synopses = text_normalize(movie_synopses, lemmatize=False)
# Extract tf-idf features over unigrams and bigrams with df cutoffs.
vectorizer, feature_matrix = build_feature_matrix(
    norm_movie_synopses,
    feature_type='tfidf',
    min_df=0.24, max_df=0.85,
    ngram_range=(1, 2))
# Report (number of documents, number of features).
print(feature_matrix.shape)
# NOTE(review): the triple-quoted string below is deliberately disabled
# K-means clustering code (flat clustering, per-cluster key features and
# movie lists). It is kept for reference only and is never executed.
'''# get feature names
feature_names = vectorizer.get_feature_names()
# print sample features
print(feature_names[:20])
#kmeans
from sklearn.cluster import KMeans
num_clusters = 5
km = KMeans(n_clusters=num_clusters,max_iter=10000)
km.fit(feature_matrix)
clusters = km.labels_
movie_data['Cluster'] = clusters
from collections import Counter
# get the total number of movies per cluster
c = Counter(clusters)
print(c.items())
def get_cluster_data(clustering_obj, movie_data,
feature_names, num_clusters,
topn_features=10):
cluster_details = {}
# get cluster centroids
ordered_centroids = clustering_obj.cluster_centers_.argsort()[:, ::-1]
# get key features for each cluster
# get movies belonging to each cluster
for cluster_num in range(num_clusters):
cluster_details[cluster_num] = {}
cluster_details[cluster_num]['cluster_num'] = cluster_num
key_features = [feature_names[index]
for index
in ordered_centroids[cluster_num, :topn_features]]
cluster_details[cluster_num]['key_features'] = key_features
movies = movie_data[movie_data['Cluster'] == cluster_num]['Title'].values.tolist()
cluster_details[cluster_num]['movies'] = movies
return cluster_details
def print_cluster_data(cluster_data):
# print cluster details
for cluster_num, cluster_details in cluster_data.items():
print(cluster_num)
print('-'*20)
print('Key features:', cluster_details['key_features'])
print('Movies in this cluster:')
print(', '.join(cluster_details['movies']))
print('='*40)
cluster_data = get_cluster_data(clustering_obj=km,
movie_data=movie_data,
feature_names=feature_names,
num_clusters=num_clusters,
topn_features=5)
print_cluster_data(cluster_data) '''
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import ward, dendrogram
from scipy.spatial.distance import squareform
from sklearn.metrics.pairwise import cosine_similarity
def ward_hierarchical_clustering(feature_matrix):
    """Compute Ward linkage over pairwise cosine distances of the documents.

    Parameters:
        feature_matrix: (n_docs, n_features) matrix (e.g. tf-idf).

    Returns:
        SciPy linkage matrix suitable for scipy.cluster.hierarchy.dendrogram.
    """
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # BUG FIX: scipy's ward() interprets a 2-D input as raw observation
    # vectors, not as a distance matrix, so passing the square distance
    # matrix directly produced an incorrect linkage. Condense it to the
    # 1-D form that ward() treats as precomputed distances. checks=False
    # tolerates tiny floating-point asymmetry/nonzero diagonal.
    condensed_distance = squareform(cosine_distance, checks=False)
    linkage_matrix = ward(condensed_distance)
    return linkage_matrix
def plot_hierarchical_clusters(linkage_matrix, movie_data, figure_size=(8,12)):
    """Render a left-oriented dendrogram labelled with the movie titles."""
    labels = movie_data['Title'].values.tolist()
    # Create the figure at the requested size, then draw the dendrogram.
    fig, ax = plt.subplots(figsize=figure_size)
    ax = dendrogram(linkage_matrix, orientation="left", labels=labels)
    plt.show()
# Build Ward's linkage matrix from the tf-idf features, then visualize it.
linkage_matrix = ward_hierarchical_clustering(feature_matrix)
plot_hierarchical_clusters(linkage_matrix=linkage_matrix,
                           movie_data=movie_data,
                           figure_size=(8, 10))