


比如我们先看看 "01-Harry Potter and the Sorcerer's Stone.txt" 里的章节情况,我们打开 txt 文件:




import re

import nltk

# Load the full text of the first novel. The context manager closes the file
# handle as soon as the contents have been read.
with open("data/01-Harry Potter and the Sorcerer's Stone.txt") as book_file:
    raw_text = book_file.read()

# A chapter heading is "Chapter <number>" on one line followed by the chapter
# title on the next line. Raw string so that \d and \n reach the regex engine.
# NOTE: [a-zA-Z ]+ only matches titles made of ASCII letters and spaces, so a
# title containing e.g. a hyphen or an apostrophe is not matched.
pattern = r'Chapter \d+\n[a-zA-Z ]+\n'

# List every chapter heading found in the book (displayed when run in a
# notebook cell).
re.findall(pattern, raw_text)

['Chapter 1\nThe Boy Who Lived\n',
 'Chapter 2\nThe Vanishing Glass\n',
 'Chapter 3\nThe Letters From No One\n',
 'Chapter 4\nThe Keeper Of The Keys\n',
 'Chapter 5\nDiagon Alley\n',
 'Chapter 7\nThe Sorting Hat\n',
 'Chapter 8\nThe Potions Master\n',
 'Chapter 9\nThe Midnight Duel\n',
 'Chapter 10\nHalloween\n',
 'Chapter 11\nQuidditch\n',
 'Chapter 12\nThe Mirror Of Erised\n',
 'Chapter 13\nNicholas Flamel\n',
 'Chapter 14\nNorbert the Norwegian Ridgeback\n',
 'Chapter 15\nThe Forbidden Forest\n',
 'Chapter 16\nThrough the Trapdoor\n',
 'Chapter 17\nThe Man With Two Faces\n']


import re

# Small hand-made sample that mimics the layout of the novels: every chapter
# starts with "Chapter <number>\n<title>\n" and is followed by its body text.
# The pieces are adjacent string literals, so `test` is one single string.
test = (
    "Chapter 1\nThe Boy Who Lived\n"
    "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. "
    "They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.\n"
    "Mr. Dursley was the director of a firm called Grunnings,\n "
    "Chapter 2\nThe Vanishing Glass\n"
    "For a second, Mr. Dursley didn’t realize what he had seen — then he jerked his head around to look again. "
    "There was a tabby cat standing on the corner of Privet Drive, but there wasn’t a map in sight. "
    "What could he have been thinking of? It must have been a trick of the light. "
    "Mr. Dursley blinked and stared at the cat.\n "
    "Chapter 3\nThe Letters From No One\n"
    "The traffic moved on and a few minutes later, Mr. Dursley arrived in the Grunnings parking lot, his mind back on drills.\n"
    "Mr. Dursley always sat with his back to the window in his office on the ninth floor. "
    "If he hadn’t, he might have found it harder to concentrate on drills that morning.\n "
    "Chapter 4\nThe Keeper Of The Keys\n"
    "He didn’t know why, but they made him uneasy. "
    "This bunch were whispering excitedly, too, and he couldn’t see a single collecting tin. \n "
    "Chapter 5\nDiagon Alley\n"
    "It was a few seconds before Mr. Dursley realized that the man was wearing a violet cloak. "
)

# Split the text on the chapter headings; what remains between two headings
# is the body of one chapter. Raw string so \d and \n reach the regex engine.
# The `if c` filter drops the empty string re.split() yields before the very
# first heading.
chapter_contents = [c for c in re.split(r'Chapter \d+\n[a-zA-Z ]+\n', test) if c]


['Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.\nMr. Dursley was the director of a firm called Grunnings,\n ',

'For a second, Mr. Dursley didn’t realize what he had seen — then he jerked his head around to look again. There was a tabby cat standing on the corner of Privet Drive, but there wasn’t a map in sight. What could he have been thinking of? It must have been a trick of the light. Mr. Dursley blinked and stared at the cat.\n ',

'The traffic moved on and a few minutes later, Mr. Dursley arrived in the Grunnings parking lot, his mind back on drills.\nMr. Dursley always sat with his back to the window in his office on the ninth floor. If he hadn’t, he might have found it harder to concentrate on drills that morning.\n ',

'He didn’t know why, but they made him uneasy. This bunch were whispering excitedly, too, and he couldn’t see a single collecting tin. \n ',

'It was a few seconds before Mr. Dursley realized that the man was wearing a violet cloak. ']




import os

import re

import matplotlib.pyplot as plt

# One bar colour per book.
colors = ['#78C850', '#A8A878', '#F08030', '#C03028', '#6890F0', '#A890F0', '#A040A0']

# File names of the seven novels, looked up under the "data/" directory.
harry_potters = ["Harry Potter and the Sorcerer's Stone.txt",
                 "Harry Potter and the Chamber of Secrets.txt",
                 "Harry Potter and the Prisoner of Azkaban.txt",
                 "Harry Potter and the Goblet of Fire.txt",
                 "Harry Potter and the Order of the Phoenix.txt",
                 "Harry Potter and the Half-Blood Prince.txt",
                 "Harry Potter and the Deathly Hallows.txt"]

# Short x-axis labels: drop the common title prefix and the ".txt" suffix.
harry_potter_names = [n.replace('Harry Potter and the ', '')[:-4]
                      for n in harry_potters]

# Number of text chunks separated by chapter headings, one entry per book.
chapter_nums = []

# "Chapter <number>" on one line, the chapter title on the next. Raw string so
# that \d and \n reach the regex engine unchanged.
pattern = r'Chapter \d+\n[a-zA-Z ]+\n'

for harry_potter in harry_potters:
    file = "data/"+harry_potter
    # The context manager closes the file once its text has been read.
    with open(file) as book_file:
        raw_text = book_file.read()
    # Non-empty pieces left after cutting the text at every chapter heading.
    chapter_contents = [c for c in re.split(pattern, raw_text) if c]
    # Record the count; plt.bar() below needs one value per book.
    chapter_nums.append(len(chapter_contents))

plt.figure(figsize=(20, 10))
plt.title('Chapter Number of Harry Potter', fontsize=25, weight='bold')

plt.bar(harry_potter_names, chapter_nums, color=colors)

plt.xticks(rotation=25, fontsize=16, weight='bold')
plt.yticks(fontsize=16, weight='bold')

plt.xlabel('Harry Potter Series', fontsize=20, weight='bold')
plt.ylabel('Chapter Number', rotation=25, fontsize=20, weight='bold')

# Render the figure when the snippet is run as a plain script.
plt.show()






import os
import re

import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer

# One bar colour per book.
colors = ['#78C850', '#A8A878', '#F08030', '#C03028', '#6890F0', '#A890F0', '#A040A0']

# File names of the seven novels, looked up under the "data/" directory.
harry_potters = ["Harry Potter and the Sorcerer's Stone.txt",
                 "Harry Potter and the Chamber of Secrets.txt",
                 "Harry Potter and the Prisoner of Azkaban.txt",
                 "Harry Potter and the Goblet of Fire.txt",
                 "Harry Potter and the Order of the Phoenix.txt",
                 "Harry Potter and the Half-Blood Prince.txt",
                 "Harry Potter and the Deathly Hallows.txt"]

# Short x-axis labels: drop the common title prefix and the ".txt" suffix.
harry_potter_names = [n.replace('Harry Potter and the ', '')[:-4]
                      for n in harry_potters]

# One ratio per book: total number of tokens divided by the number of
# distinct stemmed tokens.
richness_of_words = []
stemmer = SnowballStemmer("english")

for harry_potter in harry_potters:
    file = "data/"+harry_potter
    # The context manager closes the file once its text has been read.
    with open(file) as book_file:
        raw_text = book_file.read()

    words = word_tokenize(raw_text)
    # Lower-case before stemming so "The" and "the" count as the same word.
    words = [stemmer.stem(w.lower()) for w in words]
    wordset = set(words)

    # An empty file has no tokens at all; report 0 instead of dividing by zero.
    richness = len(words)/len(wordset) if wordset else 0.0
    # Record the value; plt.bar() below needs one entry per book.
    richness_of_words.append(richness)

plt.figure(figsize=(20, 10))
plt.title('The Richness of Word in Harry Potter', fontsize=25, weight='bold')

plt.bar(harry_potter_names, richness_of_words, color=colors)

plt.xticks(rotation=25, fontsize=16, weight='bold')
plt.yticks(fontsize=16, weight='bold')

plt.xlabel('Harry Potter Series', fontsize=20, weight='bold')
plt.ylabel('Richness of Words', rotation=25, fontsize=20, weight='bold')

# Render the figure when the snippet is run as a plain script.
plt.show()







compound: 综合情感得分

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Instantiate the analyzer; polarity_scores() is called on the instance.
analyzer = SentimentIntensityAnalyzer()

test = 'i am so sorry'

# Score the sample sentence. The result is a dict with the keys 'neg', 'neu',
# 'pos' and 'compound' (displayed when run in a notebook cell).
analyzer.polarity_scores(test)


{'neg': 0.443, 'neu': 0.557, 'pos': 0.0, 'compound': -0.1513}

import os
import re

import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# File names of the seven novels, looked up under the "data/" directory.
harry_potters = ["Harry Potter and the Sorcerer's Stone.txt",
                 "Harry Potter and the Chamber of Secrets.txt",
                 "Harry Potter and the Prisoner of Azkaban.txt",
                 "Harry Potter and the Goblet of Fire.txt",
                 "Harry Potter and the Order of the Phoenix.txt",
                 "Harry Potter and the Half-Blood Prince.txt",
                 "Harry Potter and the Deathly Hallows.txt"]

# x values: a running chapter number that keeps counting across the books.
chapter_indexes = []

# y values: the mean 'compound' score of the sentences in each chapter.
compounds = []

analyzer = SentimentIntensityAnalyzer()
chapter_index = 1

# "Chapter <number>" on one line, the chapter title on the next. Raw string so
# that \d and \n reach the regex engine unchanged.
pattern = r'Chapter \d+\n[a-zA-Z ]+\n'

for harry_potter in harry_potters:
    file = "data/"+harry_potter
    # The context manager closes the file once its text has been read.
    with open(file) as book_file:
        raw_text = book_file.read()
    # Non-empty pieces left after cutting the text at every chapter heading.
    chapters = [c for c in re.split(pattern, raw_text) if c]

    for chapter in chapters:
        sentences = sent_tokenize(chapter)
        # A chunk without any sentence has no average; skip it rather than
        # divide by zero.
        if not sentences:
            continue
        compound = sum(analyzer.polarity_scores(sentence)['compound']
                       for sentence in sentences)
        # Store the per-sentence average together with its x position.
        compounds.append(compound/len(sentences))
        chapter_indexes.append(chapter_index)
        chapter_index += 1

plt.figure(figsize=(20, 10))
plt.title('Average Sentiment of the Harry Potter', fontsize=25, weight='bold')

plt.plot(chapter_indexes, compounds, color='#A040A0')

plt.xticks(rotation=25, fontsize=16, weight='bold')
plt.yticks(fontsize=16, weight='bold')

plt.xlabel('Chapter', fontsize=20, weight='bold')
plt.ylabel('Average Sentiment', rotation=25, fontsize=20, weight='bold')

# Render the figure when the snippet is run as a plain script.
plt.show()



import os
import re

import matplotlib.pyplot as plt
import numpy as np
from nltk.tokenize import sent_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


def movingaverage(value_series, window_size):
    """Smooth a 1-D series with an unweighted moving average.

    The series is convolved with ``int(window_size)`` equal weights of
    ``1 / window_size`` using ``np.convolve(..., 'same')``.

    Args:
        value_series: One-dimensional sequence of numbers to smooth.
        window_size: Number of neighbouring points averaged together; must
            be at least 1.

    Returns:
        A NumPy array with the smoothed values. In 'same' mode the result
        has ``max(len(value_series), int(window_size))`` elements, so it is
        as long as the input whenever the window is not longer than the
        series. Points near both ends are averaged with implicit zeros.

    Raises:
        ValueError: If ``window_size`` is smaller than 1, or (raised by
            ``np.convolve``) if ``value_series`` is empty.
    """
    if int(window_size) < 1:
        # np.convolve would reject the resulting empty window anyway; fail
        # early with a message that names the actual problem.
        raise ValueError("window_size must be at least 1")
    window = np.ones(int(window_size))/float(window_size)
    return np.convolve(value_series, window, 'same')

# File names of the seven novels, looked up under the "data/" directory.
harry_potters = ["Harry Potter and the Sorcerer's Stone.txt",
                 "Harry Potter and the Chamber of Secrets.txt",
                 "Harry Potter and the Prisoner of Azkaban.txt",
                 "Harry Potter and the Goblet of Fire.txt",
                 "Harry Potter and the Order of the Phoenix.txt",
                 "Harry Potter and the Half-Blood Prince.txt",
                 "Harry Potter and the Deathly Hallows.txt"]

# x values: a running chapter number that keeps counting across the books.
chapter_indexes = []

# y values: the mean 'compound' score of the sentences in each chapter.
compounds = []

analyzer = SentimentIntensityAnalyzer()
chapter_index = 1

# "Chapter <number>" on one line, the chapter title on the next. Raw string so
# that \d and \n reach the regex engine unchanged.
pattern = r'Chapter \d+\n[a-zA-Z ]+\n'

for harry_potter in harry_potters:
    file = "data/"+harry_potter
    # The context manager closes the file once its text has been read.
    with open(file) as book_file:
        raw_text = book_file.read()
    # Non-empty pieces left after cutting the text at every chapter heading.
    chapters = [c for c in re.split(pattern, raw_text) if c]

    for chapter in chapters:
        sentences = sent_tokenize(chapter)
        # A chunk without any sentence has no average; skip it rather than
        # divide by zero.
        if not sentences:
            continue
        compound = sum(analyzer.polarity_scores(sentence)['compound']
                       for sentence in sentences)
        # Store the per-sentence average together with its x position.
        compounds.append(compound/len(sentences))
        chapter_indexes.append(chapter_index)
        chapter_index += 1

plt.figure(figsize=(20, 10))
plt.title('Average Sentiment of the Harry Potter', fontsize=25, weight='bold')

# Raw per-chapter averages.
plt.plot(chapter_indexes, compounds, color='#A040A0', label='Chapter average')

# Smoothed trend over up to 10 chapters. The window is capped at the series
# length because np.convolve(..., 'same') would otherwise return more points
# than there are chapters; passing chapter_indexes keeps both curves on the
# same x positions.
smoothing_window = max(1, min(10, len(compounds)))
plt.plot(chapter_indexes, movingaverage(compounds, smoothing_window),
         color='#6890F0', linewidth=3, label='Moving average')

plt.legend(fontsize=16)

plt.xticks(rotation=25, fontsize=16, weight='bold')
plt.yticks(fontsize=16, weight='bold')

plt.xlabel('Chapter', fontsize=20, weight='bold')
plt.ylabel('Average Sentiment', rotation=25, fontsize=20, weight='bold')

# Render the figure when the snippet is run as a plain script.
plt.show()











