import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_predict

# Toy corpus: question title -> list of tags attached to that question.
Q = {
    'What does the "yield" keyword do in Python?': ['python'],
    'What is a metaclass in Python?': ['oop'],
    'How do I check whether a file exists using Python?': ['python'],
    'How to make a chain of function decorators?': ['python', 'decorator'],
    'Using i and j as variables in Matlab': ['matlab', 'naming-conventions'],
    'MATLAB: get variable type': ['matlab'],
    'Why is MATLAB so fast in matrix multiplication?': ['performance'],
    'Is MATLAB OOP slow or am I doing something wrong?': ['matlab-oop'],
}

# Materialise the dict views as lists so the column contents do not depend on
# how the installed pandas version treats dict_keys / dict_values objects.
dataframe = pd.DataFrame({'body': list(Q.keys()), 'tag': list(Q.values())})

# X holds the raw question titles; y is the binary indicator matrix produced
# by the binarizer (one column per distinct tag).
mlb = MultiLabelBinarizer()
X = dataframe['body'].values
y = mlb.fit_transform(dataframe['tag'].values)

# Bag-of-words counts -> TF-IDF weighting -> one linear SVM per tag.
classifier = Pipeline([
    ('vectorizer', CountVectorizer(lowercase=True, stop_words='english',
                                   max_df=0.8, min_df=1)),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC())),
])

# Cross-validated predictions for every question, using cross_val_predict's
# default splitting strategy.
predicted = cross_val_predict(classifier, X, y)
import numpy as np

# Show floats with 2 digits of precision; arrays with more than 1000 elements
# are summarised instead of printed in full.
np.set_printoptions(precision=2, threshold=1000)

# Output pasted from the interactive session (8 rows x 7 columns; presumably
# the value of `predicted` -- confirm against the original session):
# array([[0, 0, 0, 0, 0, 0, 1],
#        [0, 0, 0, 0, 0, 0, 1],
#        [0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0],
#        [0, 1, 0, 0, 0, 0, 0],
#        [0, 1, 0, 0, 0, 0, 0]])


import warnings
from sklearn.model_selection import ShuffleSplit
# ShuffleSplit is already imported on the line above; the duplicate import that
# was fused onto this statement has been dropped.
# One random 50/50 split with a fixed seed so the run is reproducible.
rs = ShuffleSplit(n_splits=1, test_size=.5, random_state=0)
for train_index, test_index in rs.split(X):
    # n_splits=1, so the loop body runs once and simply keeps the split.
    train_indices, test_indices = train_index, test_index
    print("TRAIN:", train_index, "TEST:", test_index)

# Record every warning raised while fitting/predicting instead of letting the
# default filter hide repeats, then print the collected messages.
with warnings.catch_warnings(record=True) as received_warnings:
    warnings.simplefilter("always")
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]
    # NOTE(review): the pasted source lost this call (only the trailing
    # ", y_train)" survived); restored because predict() needs a fitted model
    # and the messages below are emitted during fitting.
    classifier.fit(X_train, y_train)
    predicted_test = classifier.predict(X_test)
    for w in received_warnings:
        print(w.message)

# Output pasted from the interactive session:
# TRAIN: [3 0 5 4] TEST: [6 2 1 7]
# Label not 2 is present in all training examples.
# Label not 4 is present in all training examples.
# Label not 5 is present in all training examples.



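# Output pasted from the interactive session (4 rows x 7 columns; presumably
# the value of `predicted_test` -- confirm against the original session):
# array([[1, 0, 0, 0, 0, 0, 1],
#        [0, 0, 0, 0, 0, 0, 1],
#        [0, 1, 0, 0, 0, 0, 0],
#        [0, 1, 0, 1, 0, 0, 0]])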



def get_best_tags(clf, X, lb, n_tags=3):
    """Return the ``n_tags`` highest-scoring labels for every sample in ``X``.

    Args:
        clf: Fitted estimator exposing ``decision_function(X)``. The result is
            indexed as (sample, label), one score per label.
        X: Samples, forwarded unchanged to ``clf.decision_function``.
        lb: Object whose ``classes_`` sequence maps a column index to its
            label (for example a fitted ``MultiLabelBinarizer``).
        n_tags: How many labels to return per sample. Defaults to 3.

    Returns:
        An array taken from ``lb.classes_`` with one row per sample; each row
        lists the labels from highest to lowest decision score.

    Raises:
        ValueError: If ``n_tags`` is negative.
    """
    if n_tags < 0:
        # A negative count would turn the slice below into a meaningless
        # "everything but the first column" selection.
        raise ValueError("n_tags must be non-negative")

    scores = np.asarray(clf.decision_function(X))
    # argsort is ascending, so walk each row backwards from the last column
    # and stop after n_tags entries to get the best labels first.
    best_tags = np.argsort(scores)[:, :-(n_tags + 1):-1]
    return np.asarray(lb.classes_)[best_tags]


# Three best-scoring tags (the default n_tags) for each held-out question.
# The bare expression is only displayed in an interactive session; the value
# shown there is kept below as a comment.
get_best_tags(classifier, X_test, mlb)
# array([['matlab', 'performance', 'oop'],
#        ['python', 'performance', 'oop'],
#        ['python', 'performance', 'oop'],
#        ['matlab', 'performance', 'oop']], dtype=object)


# Reference: "UserWarning: Label not :NUMBER: is present in all training examples"

