テーブル
- pandas csv読み込み
import pandas as pd

# Read train.csv from the input data directory into a DataFrame.
df = pd.read_csv(INPUT_DATA_PATH_DIR + 'train.csv')
# Bare expression: shows the column index when it is the last line of a
# notebook cell.
df.columns
- pandas EDA
import pandas_profiling as pdp

# Build a profiling report for the whole DataFrame.
pdp.ProfileReport(df)

# Row count per label_group, held in a DataFrame with a single "count" column
# indexed by label_group.
df_gb_label_group = df.groupby("label_group").size().to_frame("count")
- pandas 検索
posting_id = "AAABBBCCC"
# `@posting_id` inside the query string refers to the Python variable above.
matched_rows = df.query('posting_id == @posting_id')
# Take the image value of the last matching row.
image = matched_rows['image'].iloc[-1]
- pandas for文
# Walk the DataFrame one row at a time and print each row's title.
for index, row in df.iterrows():
    sentence = row["title"]
    print(sentence)
- ベクトル探索 faiss
import time

import faiss
import numpy as np

# faiss works on C-contiguous float32 matrices, while feature_texts may arrive
# as float64 (e.g. scikit-learn's TfidfVectorizer default), so convert once.
feature_vectors = np.ascontiguousarray(feature_texts, dtype=np.float32)

# Build the faiss index: an IVF index using a flat L2 index as its quantiser.
dimension = feature_vectors.shape[1]
nlist = min(100, len(feature_vectors))  # never ask for more cells than vectors
quantiser = faiss.IndexFlatL2(dimension)
faiss_index = faiss.IndexIVFFlat(quantiser, dimension, nlist, faiss.METRIC_L2)

# Train the index, then add the vectors to it.
faiss_index.train(feature_vectors)
faiss_index.add(feature_vectors)

# Neighbour search: probe 10 cells and return 3 neighbours per query vector.
faiss_index.nprobe = 10
s = time.time()
distance_similar_texts, idx_similar_texts = faiss_index.search(feature_vectors, 3)
e = time.time()
print("search time: {}".format(e-s))
- グラフ
テキスト
tf-idf
# Alternative implementation from cuML (left disabled):
# from cuml.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Binary TF-IDF over the title column with English stop words removed and the
# vocabulary capped at FEATURE_TEXTS_DIM terms.
model = TfidfVectorizer(
    stop_words='english',
    binary=True,
    max_features=FEATURE_TEXTS_DIM,
)
# Fit on the titles and convert the sparse result to a dense array.
feature_texts = model.fit_transform(df['title']).toarray()
- BERT
- shopee
画像
resnet
- shopee
OCR tesseract
import pytesseract

# Batch run: the first argument is a file path handed to Tesseract
# (presumably a text file listing image paths; confirm its contents).
result = pytesseract.image_to_string(
    './image_path.csv',
    lang="eng",
    config='--psm 3',
)
- 画像表示
import numpy as np from matplotlib.pyplot import imshow import matplotlib.pyplot as plt %matplotlib inline #画像の読み込み im = Image.open(INPUT_DATA_PATH_DIR + "train_images/" + image, 'r') #画像をarrayに変換 im_list = np.asarray(im) #貼り付け plt.imshow(im_list) #表示 print(image) plt.show()
その他
- chunk
- cudf
- cuml
# cuML imports: PCA and nearest-neighbour search.
from cuml import PCA
from cuml.neighbors import NearestNeighbors