テーブル
import pandas as pd
# Load the training table from train.csv under INPUT_DATA_PATH_DIR, then
# reference its column index for inspection.
train_csv_path = INPUT_DATA_PATH_DIR + 'train.csv'
df = pd.read_csv(train_csv_path)
df.columns
import pandas_profiling as pdp
# Build a pandas-profiling report for the whole table.
pdp.ProfileReport(df)
# Count the rows in each label_group and expose the sizes as a one-column
# ("count") frame indexed by label_group.
rows_per_group = df.groupby("label_group").size()
df_gb_label_group = pd.DataFrame({"count": rows_per_group})
# Pick the image value of the last row whose posting_id equals the id below.
# The trailing .iloc[-1] raises IndexError when no row matches.
posting_id = "AAABBBCCC"
matching_rows = df.query('posting_id == @posting_id')
image = matching_rows['image'].iloc[-1]
# Print every value of the `title` column, one per line.
# Iterating the column directly avoids building a full row Series per record
# (as iterrows() does) when only a single field is read.
for sentence in df["title"]:
    print(sentence)
# Build an IVF-Flat (L2) Faiss index over the text feature vectors, then time
# a 3-nearest-neighbour search of every vector against the index.
#
# Faiss operates on C-contiguous float32 matrices, while the TF-IDF dense
# output is float64, so cast once up front (no copy if already float32/C).
# assumes feature_texts is a 2-D numpy array (rows = samples) — TODO confirm
feature_vectors = feature_texts.astype("float32", order="C", copy=False)
dimension = feature_vectors.shape[1]
# Use at most 100 inverted lists, and never more lists than vectors.
nlist = min(100, len(feature_vectors))
quantiser = faiss.IndexFlatL2(dimension)
faiss_index = faiss.IndexIVFFlat(quantiser, dimension, nlist, faiss.METRIC_L2)
# An IVF index must be trained (coarse clustering) before vectors are added.
faiss_index.train(feature_vectors)
faiss_index.add(feature_vectors)
# Number of inverted lists visited per query.
faiss_index.nprobe = 10
# perf_counter() is the clock intended for measuring short elapsed intervals.
s = time.perf_counter()
# The queries are the indexed vectors themselves, so expect each row's own
# entry among its 3 neighbours.
distance_similar_texts, idx_similar_texts = faiss_index.search(feature_vectors, 3)
e = time.perf_counter()
print("search time: {}".format(e-s))
テキスト
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectoriser on the `title` column (English stop words, binary
# mode, at most FEATURE_TEXTS_DIM features) and densify the result.
model = TfidfVectorizer(
    stop_words='english',
    binary=True,
    max_features=FEATURE_TEXTS_DIM,
)
title_tfidf = model.fit_transform(df['title'])
feature_texts = title_tfidf.toarray()
画像
import pytesseract
# Run Tesseract OCR (English, page-segmentation mode 3) and keep the text.
# NOTE(review): the input is a .csv path rather than an image — presumably a
# text file listing image paths (pytesseract batch mode); confirm it is not
# just a placeholder.
result = pytesseract.image_to_string(
    './image_path.csv',
    lang="eng",
    config='--psm 3',
)
import numpy as np
from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt
%matplotlib inline
# Open the file named by `image` from the train_images folder, convert it to
# a numpy array, draw it with matplotlib and print the file name.
image_file = INPUT_DATA_PATH_DIR + "train_images/" + image
im = Image.open(image_file, 'r')
im_list = np.asarray(im)
plt.imshow(im_list)
print(image)
plt.show()
その他
from cuml import PCA
from cuml.neighbors import NearestNeighbors