spell = SpellChecker()

def correct_spellings(text):
    """Return *text* with each misspelled word replaced by its best correction.

    Words that pyspellchecker does not recognize are passed through
    ``spell.correction``; everything else is kept verbatim. The result is
    re-joined with single spaces.
    """
    # Split once and reuse; the original split the text twice.
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() returns None when no candidate exists
            # (pyspellchecker >= 0.6); fall back to the original word so
            # " ".join below never receives None.
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

text = "corect me plese"
correct_spellings(text)
# Bar chart of punctuation-mark frequencies in disaster tweets
# (create_corpus(1) — presumably target == 1; confirm against create_corpus).
plt.figure(figsize=(10,5))
corpus = create_corpus(1)

dic = defaultdict(int)
import string
special = string.punctuation  # the set of ASCII punctuation characters
# Count every corpus token that is a punctuation character.
for i in (corpus):
    if i in special:
        dic[i] += 1
x, y = zip(*dic.items())
plt.bar(x, y)
[結果]
災害に関係のないツイートの句読点出現回数をグラフで表示します。
[ソース]
1 2 3 4 5 6 7 8 9 10 11 12 13
# Bar chart of punctuation-mark frequencies in non-disaster tweets
# (create_corpus(0) — presumably target == 0; confirm against create_corpus).
plt.figure(figsize=(10,5))
corpus = create_corpus(0)

dic = defaultdict(int)
import string
special = string.punctuation  # the set of ASCII punctuation characters
# Count every corpus token that is a punctuation character.
for i in (corpus):
    if i in special:
        dic[i] += 1
x, y = zip(*dic.items())
plt.bar(x, y, color='green')
# Side-by-side distributions of the average word length per tweet for
# disaster (target == 1, red) vs non-disaster (target == 0, green) tweets.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
# For each tweet: split into words, take each word's length, then average.
word = tweet[tweet['target'] == 1]['text'].str.split().apply(lambda x: [len(i) for i in x])
sns.distplot(word.map(lambda x: np.mean(x)), ax=ax1, color='red')
ax1.set_title('disaster')
word = tweet[tweet['target'] == 0]['text'].str.split().apply(lambda x: [len(i) for i in x])
sns.distplot(word.map(lambda x: np.mean(x)), ax=ax2, color='green')
ax2.set_title('Not disaster')
fig.suptitle('Average word length in each tweet')
import os import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np from nltk.corpus import stopwords from nltk.util import ngrams from sklearn.feature_extraction.text import CountVectorizer from collections import defaultdict from collections import Counter plt.style.use('ggplot') stop=set(stopwords.words('english')) import re from nltk.tokenize import word_tokenize import gensim import string from keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences from tqdm import tqdm from keras.models import Sequential from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D from keras.initializers import Constant from sklearn.model_selection import train_test_split from keras.optimizers import Adam
# ライブラリをインポート import numpy as np import pandas as pd import matplotlib.pyplot as plt import os from tqdm import tqdm import seaborn as sns from sklearn.model_selection import train_test_split import warnings warnings.filterwarnings("ignore")
import torch import torch.nn as nn import torchvision from torch.utils.data import DataLoader, Dataset from torchvision import transforms
def update_lr(optimizer, lr):
    """Overwrite the learning rate of every parameter group in *optimizer*."""
    for group in optimizer.param_groups:
        group['lr'] = lr
# 学習する total_step = len(train_loader) curr_lr = learning_rate for epoch in range(num_epochs): for i, (images, labels) in enumerate(train_loader): images = images.to(device) labels = labels.to(device)
# Forward pass(順伝搬:初期の入力を層ごとに処理して出力に向けて送ること) outputs = model(images) loss = criterion(outputs, labels.flatten())
# Backward and optimize(逆伝播と最適化を行う) optimizer.zero_grad() loss.backward() optimizer.step()
if (i + 1) % 300 == 0: print(f'Epoch: {epoch + 1}/{num_epochs}, Loss: {loss.item()}')
# 評価する model.eval() with torch.no_grad(): correct = 0 total = 0 for images, labels in val_loader: images = images.to(device) labels = labels.to(device)
import numpy as np import pandas as pd import matplotlib.pyplot as plt import os from tqdm import tqdm from sklearn.model_selection import train_test_split import seaborn as sns import warnings warnings.filterwarnings("ignore")
import torch import torch.nn as nn from torch.autograd import Variable import torchvision
CSVファイル読み込み・正解ラベルの準備
CSVファイルを読み込み、正解ラベルのデータを分けます。
正解ラベルはNumpyのint32型に変換します。
[ソース]
1 2 3 4
# Load the MNIST training CSV with pixel values as float32, then split off
# the "label" column as an int32 Series (data keeps only the pixel columns).
data = pd.read_csv('../input/digit-recognizer/train.csv', dtype=np.float32)
labels = data.pop('label').astype('int32')
data.head()  # preview the first five rows
[結果]
変換した正解ラベルを表示します。
[ソース]
1
labels.head()  # preview the labels after conversion to int32
[結果]
次に0~255のピクセルデータを0~1に変換します。(正規化)
正解ラベルと0~1に変換したデータはNumpyの配列型にしておきます。
また、訓練データと評価データに分けます。(4:1の割合)
[ソース]
1 2 3 4 5
data = data.to_numpy() / 255.0 # converting to numpy and normalizing between 0 and 1 labels = labels.to_numpy()
# Show the first 16 training digits in a 4x4 grid, each labeled with its target.
plt.figure(figsize=(12, 10))
for i in range(16):
    plt.subplot(4, 4, i + 1)
    plt.xticks([])  # hide the axis ticks
    plt.yticks([])
    # assumes each row of x_train is a flattened 28x28 image — TODO confirm
    plt.imshow(x_train[i].reshape(28, 28), cmap=plt.cm.binary)
    plt.xlabel(y_train[i])
plt.show()
# Train the model; every 50 iterations, measure accuracy on the validation set.
total_step = len(train_loader)  # number of batches per epoch
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Reshape to (batch, channel, H, W). Use -1 for the batch dimension:
        # the last batch of an epoch can hold fewer than 32 samples, and the
        # original hard-coded view(32, 1, 28, 28) raises a RuntimeError on it.
        images = images.view(-1, 1, 28, 28).to(device)
        labels = labels.to(device)

        # Forward pass: predictions and loss.
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        iter_num += 1

        if iter_num % 50 == 0:
            # Calculate accuracy over the whole validation set.
            correct = 0
            total = 0
            for images, labels in val_loader:
                # -1 likewise supports any validation batch size (was fixed at 1).
                test = images.view(-1, 1, 28, 28).to(device)
                labels = labels.to(device)

                # Forward pass on the validation batch.
                outputs = model(test)
                # Index of the max logit along dim 1 = predicted class.
                predicted = torch.max(outputs.data, 1)[1]

                total += len(labels)
                correct += (predicted == labels).sum()

            accuracy = 100 * correct / float(total)

            # Record loss/accuracy for later plotting.
            loss_list.append(loss.data)
            iteration_list.append(iter_num)
            accuracy_list.append(accuracy)
            if iter_num % 500 == 0:
                # Print loss and accuracy.
                print('Iteration: {}, Loss: {:.4f}, Accuracy: {:.4f} %'.format(iter_num, loss.data, accuracy))
class LogisticRegression(nn.Module):
    """Multinomial logistic regression: a single linear layer over the input.

    The raw linear outputs (logits) are returned; softmax is expected to be
    applied by the loss function (e.g. CrossEntropyLoss).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # One fully-connected layer mapping input features to class logits.
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        return self.linear(x)
# Instantiate the model and move its parameters to the target device (CPU/GPU).
model = LogisticRegression(input_size, output_size).to(device)
# Visualize loss(損失のグラフ化) plt.plot(iteration_list,loss_list) plt.xlabel("Num. of Iters.") plt.ylabel("Loss") plt.title("Logistic Regression: Loss vs Num. of Iters.") plt.show()
# Visualize accuracy(正解率のグラフ化) plt.plot(iteration_list,accuracy_list) plt.xlabel("Num. of Iters.") plt.ylabel("Accuracy") plt.title("Logistic Regression: Accuracy vs Num. of Iters.") plt.show()
import numpy as np import pandas as pd import matplotlib.pyplot as plt import os from tqdm import tqdm import seaborn as sns from sklearn.model_selection import train_test_split import warnings warnings.filterwarnings("ignore")
import torch import torch.nn as nn import torchvision from torch.utils.data import TensorDataset, DataLoader, Dataset
# Show the first 20 samples as 28x28 images in a 5x4 grid.
plt.figure(figsize=(12, 10))
for i in range(20):
    plt.subplot(5, 4, i + 1)
    plt.grid(False)
    plt.xticks([])  # hide the axis ticks
    plt.yticks([])
    # assumes each row of data is a flattened 28x28 image — TODO confirm
    plt.imshow(data[i].reshape(28, 28))
# Remove double quotes from passenger names in both DataFrames.
# Vectorized str.replace instead of the original per-row loop:
#  - the loop used chained assignment (df['col'][i] = ...), which triggers
#    pandas' SettingWithCopyWarning and can silently fail to write through;
#  - it indexed by the enumerate() position, which is only correct when the
#    DataFrame index happens to be 0..n-1.
# '"' has no special regex meaning, so regex=False matches re.sub exactly.
test_data_with_labels['name'] = test_data_with_labels['name'].str.replace('"', '', regex=False)
test_data['Name'] = test_data['Name'].str.replace('"', '', regex=False)
# Look up each test passenger's survival label by name.
# Build the name -> survived mapping once: dict(zip(...)) keeps the LAST
# occurrence for duplicate names, matching the original `.values[-1]`
# behavior, and turns the O(n) DataFrame scan per name into an O(1) lookup.
# (A name missing from the labels now raises KeyError instead of IndexError —
# both are hard failures, as before.)
name_to_survived = dict(zip(test_data_with_labels['name'], test_data_with_labels['survived']))
survived = [int(name_to_survived[name]) for name in test_data['Name']]