Easy Semantic Color (TF-IDF)
example:
https://gyazo.com/f5afd0e448225f72fc6b29aa1eefcdd7
1. 半角スペースで分かち書きされたデータを用意してください
code:sample.py
# Sample input: one string per document, with tokens already separated by
# single half-width spaces (pre-tokenized text, e.g. Japanese after word segmentation).
document_sample = [
"This is a test.",
"これ は サンプル です 。"
]
2. perform_tfidf_pca_standardization
a. TF-IDF
b. PCAで2次元に削減
c. 標準化
3. polar_coordinates_to_color
a. 極座標を計算
b. 中心角に色相環をあてはめ
4. (Optional) scatter_pca
結果の表示
code:color.py
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.colors as colors
def polar_coordinates_to_color(pca_result, saturation=0.8, brightness=0.8):
    """
    Maps 2D points to RGB colors by placing the hue wheel on each point's polar angle.

    The angle of each (x, y) point around the origin is normalized to a hue in [0, 1];
    saturation and brightness are shared by every point.

    :param pca_result: An iterable of (x, y) pairs, or an array of shape (n_points, 2)
    :param saturation: HSV saturation in [0, 1] applied to every color
    :param brightness: HSV value (brightness) in [0, 1] applied to every color
    :return: A numpy array of shape (n_points, 3) holding one RGB row (each channel in
        [0, 1]) per input point, in input order
    """
    # Local stdlib import keeps this function self-contained.
    import colorsys

    points = np.asarray(pca_result, dtype=float).reshape(-1, 2)

    # arctan2 yields angles in (-180, 180] degrees; the modulo wraps them into [0, 360)
    # so that the hue grows monotonically counter-clockwise from the positive x axis.
    theta_degrees = np.degrees(np.arctan2(points[:, 1], points[:, 0])) % 360
    hues = theta_degrees / 360.0  # Normalize to [0, 1] for color mapping

    # Convert to RGB colors
    rgb_colors = [colorsys.hsv_to_rgb(hue, saturation, brightness) for hue in hues]

    # reshape keeps the (n_points, 3) shape even when there are no points at all
    return np.array(rgb_colors, dtype=float).reshape(-1, 3)
def perform_tfidf_pca_standardization(documents):
    """
    Turns a list of documents into standardized 2D coordinates.

    The pipeline is: TF-IDF vectorization -> PCA projection onto 2 components ->
    per-component standardization with StandardScaler.

    :param documents: A list of documents (strings)
    :return: A numpy array with one row per document and 2 columns, holding the
        standardized PCA projection of the TF-IDF vectors
    """
    # TF-IDF weights, densified before they are handed to PCA
    term_weights = TfidfVectorizer().fit_transform(documents).toarray()

    # Project onto the first two principal components
    projected = PCA(n_components=2).fit_transform(term_weights)

    # Rescale each component with StandardScaler's default settings
    return StandardScaler().fit_transform(projected)
def scatter_pca(pca_result):
    """
    Shows a scatter plot of 2D PCA coordinates, coloring each point by its polar angle.

    :param pca_result: A 2D array-like with one (x, y) row per point, e.g. the output
        of perform_tfidf_pca_standardization
    :return: None; the figure is displayed with plt.show()
    """
    # Accept nested lists as well as numpy arrays so the column slicing below works
    points = np.asarray(pca_result)

    # Convert PCA Results to Colors
    colors_for_plotting = polar_coordinates_to_color(points)

    # Plotting the Scatter Plot
    plt.figure(figsize=(8, 6))
    plt.scatter(points[:, 0], points[:, 1], c=colors_for_plotting)
    plt.title("TF-IDF PCA Scatter Plot with Polar Coordinates Coloring")
    plt.xlabel("PCA Dimension 1")
    plt.ylabel("PCA Dimension 2")
    plt.show()