香港中文大學數學系
2026年6月27日
# ==============================================================================
# 為自訂的二維合成數據集(N=7 個樣本)逐步實作 PCA
# 完全對應所提供 LaTeX 文件中的手動算術計算
# 圖表:1) 原始輸入散佈圖(在所有計算之前)
# 2) 協方差特徵向量軸圖 + 投影的 1D PCA 結果(計算之後)
# 所有中間輸出結果都會印出至終端機,以便手動驗證答案
# ==============================================================================
import numpy as np
import matplotlib.pyplot as plt
# ------------------------------------------------------
# 1. 定義原始輸入數據集(與 LaTeX 中的表格相符)
# 列 (Rows) = N=7 個觀測值,欄 (Columns) = x 特徵,y 特徵
# ------------------------------------------------------
# 來自論文表格的原始 (x, y) 配對數據
raw_data = np.array([
[2.5, 2.4],
[0.5, 0.7],
[2.2, 2.9],
[1.9, 2.2],
[3.1, 3.0],
[2.3, 2.7],
[2.0, 1.6]
])
N = raw_data.shape[0] # 總樣本數 N=7
d = raw_data.shape[1] # 特徵維度 d=2
print("=" * 60)## ============================================================
## RAW INPUT DATASET (N=7 samples, 2 features)
## [[2.5 2.4]
## [0.5 0.7]
## [2.2 2.9]
## [1.9 2.2]
## [3.1 3. ]
## [2.3 2.7]
## [2. 1.6]]
## ============================================================
# --------------------------
# 圖表 1:原始數據視覺化(在任何 PCA 計算前顯示)
# --------------------------
plt.figure(figsize=(6, 5))
plt.scatter(raw_data[:, 0], raw_data[:, 1], color="darkblue", s=60, label="Raw (x,y) Samples")
plt.xlabel("x Feature")
plt.ylabel("y Feature")
plt.title("Raw 2D Input Dataset (Pre-PCA Visualization)")
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()# ------------------------------------------------------
# 步驟 1:平均值中心化(減去各特徵的平均值)
# 計算特徵平均值 x̄, ȳ,然後對所有點進行中心化
# ------------------------------------------------------
# 計算每個特徵的樣本平均值(逐欄平均)
feature_means = np.mean(raw_data, axis=0)
x_mean = feature_means[0]
y_mean = feature_means[1]
# 印出平均值以交叉核對 LaTeX 手動計算結果
print(f"\nFeature x mean x̄ = {x_mean:.6f} (Manual ref: 2.0714)")##
## Feature x mean x̄ = 2.071429 (Manual ref: 2.0714)
## Feature y mean ȳ = 2.214286 (Manual ref: 2.2142)
# 均值中心化的偏差矩陣 X_centered = raw - mean
X_centered = raw_data - feature_means
print("\nMEAN-CENTERED DEVIATION MATRIX [X = x_i-x̄ , Y = y_i-ȳ]")##
## MEAN-CENTERED DEVIATION MATRIX [X = x_i-x̄ , Y = y_i-ȳ]
## [[ 0.4286 0.1857]
## [-1.5714 -1.5143]
## [ 0.1286 0.6857]
## [-0.1714 -0.0143]
## [ 1.0286 0.7857]
## [ 0.2286 0.4857]
## [-0.0714 -0.6143]]
# ------------------------------------------------------
# 步驟 2:計算無偏樣本協方差矩陣
# 公式:Σ = 1/(N-1) * X_centered^T @ X_centered
# 分母 N-1 = 6,用於無偏樣本協方差
# ------------------------------------------------------
cov_matrix = np.cov(X_centered, rowvar=False) # rowvar=False: 欄 = 特徵
print("\nSAMPLE COVARIANCE MATRIX Σ (Unbiased, N-1=6)")##
## SAMPLE COVARIANCE MATRIX Σ (Unbiased, N-1=6)
## [[0.63571429 0.58547619]
## [0.58547619 0.67142857]]
## Matches manual result: [[0.63571429, 0.58547619],[0.58547619, 0.67142857]]
# ------------------------------------------------------
# 步驟 3:協方差矩陣的特徵分解
# 解 Σu = λu;將特徵值按降序排列
# ------------------------------------------------------
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
# 將特徵對 (eigenpairs) 從最大 λ 排列到最小 λ
sort_idx = np.argsort(eigenvalues)[::-1]
sorted_eig_vals = eigenvalues[sort_idx]
sorted_eig_vecs = eigenvectors[:, sort_idx]
# 提取 PC1 特徵向量(最大特徵值對應的基底向量)
pc1_vector = sorted_eig_vecs[:, 0]
lambda1, lambda2 = sorted_eig_vals[0], sorted_eig_vals[1]
print("\nSORTED EIGENVALUES (Descending λ1, λ2)")##
## SORTED EIGENVALUES (Descending λ1, λ2)
## λ1 = 1.23931988+0.00000000j, λ2 = 0.06782298+0.00000000j
## Manual reference: λ1=1.23931988, λ2=0.06782298
##
## ORTHONORMAL EIGENVECTOR MATRIX U (Columns = PC1, PC2)
## [[-0.69624492+0.j -0.7178043 +0.j]
## [-0.7178043 +0.j 0.69624492+0.j]]
# ------------------------------------------------------
# 步驟 4:計算解釋變異數比率 (Explained Variance Ratios)
# EV_k = λ_k / (λ1 + λ2),總和 λ1+λ2 = 數據集總變異數
# ------------------------------------------------------
total_variance = lambda1 + lambda2
ev_ratio1 = lambda1 / total_variance
ev_ratio2 = lambda2 / total_variance
print("\nEXPLAINED VARIANCE RATIOS")##
## EXPLAINED VARIANCE RATIOS
## Total Variance Sum λ1+λ2 = 1.30714286+0.00000000j
## EV1 (PC1) = 0.94811357+0.00000000j (~94.81%)
## EV2 (PC2) = 0.05188643+0.00000000j (~5.19%)
## Sum EV1+EV2 = 1.0+0.0j (must equal 1)
# ------------------------------------------------------
# 步驟 5:將中心化數據投影到 PC1 基底,獲得一維降維數據
# 投影公式:Z = X_centered @ pc1_vector
# 等同於 Z = u1^T * X_centered.T(與 LaTeX 數學符號相符)
# ------------------------------------------------------
Z_1d = X_centered @ pc1_vector
print("\nFINAL 1D PCA PROJECTION VALUES (PC1 Compressed Dataset)")##
## FINAL 1D PCA PROJECTION VALUES (PC1 Compressed Dataset)
## [-0.431697+0.j 2.18106 +0.j -0.581726+0.j 0.129611+0.j -1.280127+0.j
## -0.507789+0.j 0.490669+0.j]
## Matches manual table of PC1 projected values in LaTeX
# --------------------------
# 圖表 2:PCA 結果視覺化
# 子圖 1:中心化數據 + PC1 / PC2 特徵向量軸
# 子圖 2:一維投影 PC1 數值分佈
# --------------------------
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# 子圖 1:中心化數據與主成分軸
ax1.scatter(X_centered[:, 0], X_centered[:, 1], c="crimson", s=60, label="Mean-Centered Samples")
# 繪製 PC1 特徵向量線
scale = 1.2
ax1.plot([0, pc1_vector[0]*scale], [0, pc1_vector[1]*scale], c="black", lw=2.5, label="PC1 Axis (94.81% variance)")
# 繪製 PC2 特徵向量線
pc2_vector = sorted_eig_vecs[:, 1]
ax1.plot([0, pc2_vector[0]*scale], [0, pc2_vector[1]*scale], c="gray", lw=2, linestyle="--", label="PC2 Axis (5.19% variance)")
ax1.axhline(y=0, color="k", alpha=0.2)
ax1.axvline(x=0, color="k", alpha=0.2)
ax1.set_xlabel("Centered X")
ax1.set_ylabel("Centered Y")
ax1.set_title("Centered Data + PCA Principal Component Axes")
ax1.grid(True, alpha=0.3)
ax1.legend()
# 子圖 2:一維投影 PC1 數值分佈
# sample_indices = np.arange(1, N+1)
# ax2.scatter(Z_1d, sample_indices, c="darkgreen", s=60)
# ax2.set_xlabel("PC1 Projected 1D Value")
# ax2.set_ylabel("Sample Index (1 to 7)")
# ax2.set_title("Reduced 1D Dataset After PCA Projection")
# ax2.grid(True, alpha=0.3)
# plt.tight_layout()
# plt.show()
# 為原始的 7 個樣本建立索引標籤 (1,2,3,4,5,6,7)
sample_indices = np.arange(1, N+1)
# 散佈圖:水平軸 = 壓縮後的一維 PCA 數值 Z_1d;垂直軸 = 原始樣本 ID
ax2.scatter(Z_1d, sample_indices, c="darkgreen", s=60)
# 坐標軸標籤,增加可讀性
ax2.set_xlabel("PC1 Projected 1D Value")
ax2.set_ylabel("Sample Index (1 to 7)")
# 圖表標題說明核心輸出:從 2D → 1D 的降維
ax2.set_title("Reduced 1D Dataset After PCA Projection")
ax2.grid(True, alpha=0.3)
# 自動調整子圖間距
plt.tight_layout()
# 渲染合併後的圖形(中心化數據軸 + 此一維投影面板)
plt.show()# ==============================================================================
# 答案驗證總結區塊
# ==============================================================================
print("\n" + "="*60)##
## ============================================================
print("VERIFICATION COMPLETE: All intermediate values match manual PCA calculation in the LaTeX document")## VERIFICATION COMPLETE: All intermediate values match manual PCA calculation in the LaTeX document
## 1. Raw data plot shown before all PCA computation
## 2. Post-PCA figures generated with centered data axes & 1D projection distribution
## ============================================================
# ==============================================================================
# 4 變數電腦偏好李克特量表 (Likert Survey) 的完整 PCA 流程
# N=16 名受訪者,p=4 個特徵:價格 (Price)、軟體 (Software)、外觀 (Aesthetics)、品牌 (Brand)
# 流程與參考的 SageMath 程式碼完全對應
# 視覺化拆分:
# 1) 計算前的成對原始數據圖(在 PCA 數學運算前顯示)
# 2) 計算後的雙圖:陡坡變異數折線圖 (Scree plot) + 2D PC1/PC2 散佈圖
# 印出所有中間矩陣與指標以供手動交叉檢查
# ==============================================================================
import numpy as np
import matplotlib.pyplot as plt
# ------------------------------------------------------------------------------
# 步驟 1:定義原始問卷數據矩陣(完全複製 SageMath 的輸入)
# 列 = 個別受訪者,欄 = [價格, 軟體, 外觀, 品牌]
# 李克特量表:1 = 非常不同意,7 = 非常同意
# ------------------------------------------------------------------------------
raw_X = np.array([
[6, 5, 3, 4],
[7, 3, 2, 2],
[6, 4, 4, 5],
[5, 7, 1, 3],
[7, 7, 5, 5],
[6, 4, 2, 3],
[5, 7, 2, 1],
[6, 5, 4, 4],
[3, 5, 6, 7],
[1, 3, 7, 5],
[2, 6, 6, 7],
[5, 7, 7, 6],
[2, 4, 5, 6],
[3, 5, 6, 5],
[1, 6, 5, 5],
[2, 3, 7, 7]
])
n_samples, p_features = raw_X.shape
feature_names = ["Price", "Software", "Aesthetics", "Brand"]
# 印出原始輸入矩陣以供手動驗證
print("=" * 72)## ========================================================================
## RAW SURVEY DATA MATRIX X (16 samples × 4 features)
## Feature Column Order: ['Price', 'Software', 'Aesthetics', 'Brand']
## [[6 5 3 4]
## [7 3 2 2]
## [6 4 4 5]
## [5 7 1 3]
## [7 7 5 5]
## [6 4 2 3]
## [5 7 2 1]
## [6 5 4 4]
## [3 5 6 7]
## [1 3 7 5]
## [2 6 6 7]
## [5 7 7 6]
## [2 4 5 6]
## [3 5 6 5]
## [1 6 5 5]
## [2 3 7 7]]
##
## Sample count n = 16, Feature dimension p = 4
## ========================================================================
# --------------------------
# 圖表 1:PCA 前的成對原始視覺化(在所有 PCA 計算前渲染)
# 4x4 網格:對角線 = 單一特徵直方圖,非對角線 = 特徵散佈圖
# --------------------------
fig_raw, axes_raw = plt.subplots(p_features, p_features, figsize=(10, 10))
fig_raw.suptitle("Raw 4D Survey Data Pairwise Preview (Pre-PCA Visualization)", fontsize=14)
for i in range(p_features):
for j in range(p_features):
ax = axes_raw[i, j]
if i != j:
ax.scatter(raw_X[:, j], raw_X[:, i], c="navy", s=30, alpha=0.7)
else:
ax.hist(raw_X[:, i], bins=range(0, 9), color="steelblue", alpha=0.6)
ax.set_xlabel(feature_names[j], fontsize=8)
ax.set_ylabel(feature_names[i], fontsize=8)
ax.tick_params(axis="both", labelsize=6)
ax.grid(True, alpha=0.2)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()# ------------------------------------------------------------------------------
# 步驟 2:標準化(Z 分數中心化 + 縮放,複製 Sage 的巢狀 for 迴圈)
# 標準化項目的公式:X_ctr[row,col] = (X[row,col] - column_mean) / column_std
# 母體標準差 (ddof=0) 以對應 SageMath 的原生行為
# ------------------------------------------------------------------------------
X_standardized = np.zeros_like(raw_X, dtype=np.float64)
# 遍歷每個特徵欄位
for col_idx in range(p_features):
single_col = raw_X[:, col_idx]
col_mean = np.mean(single_col)
col_std = np.std(single_col, ddof=0)
# 遍歷每個樣本列以填入標準化矩陣
for row_idx in range(n_samples):
X_standardized[row_idx, col_idx] = (raw_X[row_idx, col_idx] - col_mean) / col_std
# 印出四捨五入至小數點後 4 位的標準化矩陣(與 Sage 的 .n(digits=4) 相符)
print("\nSTANDARDIZED (CENTERED + SCALED) MATRIX X_ctr (4 decimal precision)")##
## STANDARDIZED (CENTERED + SCALED) MATRIX X_ctr (4 decimal precision)
## [[ 0.8764 -0.0436 -0.7746 -0.3993]
## [ 1.3599 -1.4375 -1.291 -1.5608]
## [ 0.8764 -0.7405 -0.2582 0.1815]
## [ 0.3929 1.3504 -1.8074 -0.98 ]
## [ 1.3599 1.3504 0.2582 0.1815]
## [ 0.8764 -0.7405 -1.291 -0.98 ]
## [ 0.3929 1.3504 -1.291 -2.1416]
## [ 0.8764 -0.0436 -0.2582 -0.3993]
## [-0.5742 -0.0436 0.7746 1.343 ]
## [-1.5412 -1.4375 1.291 0.1815]
## [-1.0577 0.6534 0.7746 1.343 ]
## [ 0.3929 1.3504 1.291 0.7623]
## [-1.0577 -0.7405 0.2582 0.7623]
## [-0.5742 -0.0436 0.7746 0.1815]
## [-1.5412 0.6534 0.2582 0.1815]
## [-1.0577 -1.4375 1.291 1.343 ]]
## ------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# 步驟 3:奇異值分解 (SVD):X_ctr = U @ S @ V.T
# np.linalg.svd 會回傳 U、奇異值向量及 V 的轉置 (Vh)
# ------------------------------------------------------------------------------
U, singular_vals, V_transpose = np.linalg.svd(X_standardized, full_matrices=False)
S_diag = np.diag(singular_vals)
# ------------------------------------------------------------------------------
# 步驟 4:計算主成分變異數與解釋變異數百分比
# Sage 公式:Var_PC[k] = singular_vals[k]^2 / n_samples
# Sage 公式:PropVar[k] = 100 * Var_PC[k] / total_variance
# ------------------------------------------------------------------------------
pc_variances = [(singular_vals[k] ** 2) / n_samples for k in range(p_features)]
total_var = sum(pc_variances)
explained_var_pct = [100 * var / total_var for var in pc_variances]
# 印出 PCA 變異數指標以供手動交叉驗證答案
print("Principal Component Variances (Var_PC, 4 decimal precision):")## Principal Component Variances (Var_PC, 4 decimal precision):
## [2.4303 0.9612 0.4647 0.1438]
##
## Percentage of Total Variance Explained by Each PC (Prop_Var %):
## [60.76 24.03 11.62 3.6 ]
cumulative_pc1pc2 = sum(explained_var_pct[:2])
print(f"\nCumulative explained variance PC1 + PC2 = {np.round(cumulative_pc1pc2, 2)}%")##
## Cumulative explained variance PC1 + PC2 = 84.79%
## (Confirms document claim: ~85% of total dataset information retained with 2 dimensions)
## ------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# 步驟 5:計算 PCA 分數矩陣 Z = U @ S (對應 Sage 的 PC_score = U*S)
# 僅提取 PC1 和 PC2 分數以進行二維視覺化
# ------------------------------------------------------------------------------
pc_score_matrix = U @ S_diag
pc1_pc2_scores = pc_score_matrix[:, 0:2]
# --------------------------
# 圖表 2:PCA 後的雙子圖(與參考的雙圖影像相符)
# 左側:陡坡變異數折線圖(標示為「Variances」的紅線)
# 右側:PC1 與 PC2 樣本投影的二維散佈圖(紅色數據點)
# --------------------------
fig_post, (ax_scree, ax_pc2d) = plt.subplots(1, 2, figsize=(14, 6))
# 左子圖:陡坡圖 (Scree Plot)
pc_index_axis = np.arange(1, p_features + 1)
ax_scree.plot(pc_index_axis, pc_variances, color="red", linewidth=2, marker="o")
ax_scree.set_title("Scree Plot of Principal Component Variances", fontsize=12)
ax_scree.set_xlabel("Principal Component Index (PC1, PC2, PC3, PC4)")
ax_scree.set_ylabel("Variances")
ax_scree.grid(True, alpha=0.3)
ax_scree.set_xticks(pc_index_axis)
# 在 PC2 處的垂直虛線(轉折點截斷線)
ax_scree.axvline(x=2, color="black", linestyle="--", alpha=0.6, label="Elbow Cutoff: Retain PC1 + PC2")
ax_scree.legend()
# 右子圖:二維 PC 分數散佈圖
ax_pc2d.scatter(pc1_pc2_scores[:, 0], pc1_pc2_scores[:, 1], color="red", s=40)
ax_pc2d.set_title("2D Projection of Survey Data (PC1 vs PC2)", fontsize=12)
ax_pc2d.set_xlabel(f"PC1 ({np.round(explained_var_pct[0],1)}% Variance)")
ax_pc2d.set_ylabel(f"PC2 ({np.round(explained_var_pct[1],1)}% Variance)")
ax_pc2d.axhline(y=0, color="gray", alpha=0.4)
ax_pc2d.axvline(x=0, color="gray", alpha=0.4)
ax_pc2d.grid(True, alpha=0.3)
fig_post.suptitle("Post-PCA Analysis Output Visualizations", fontsize=14)
plt.tight_layout(rect=[0, 0, 1, 0.94])
# --------------------------
# 新程式碼區塊:將圖形儲存至目前工作目錄
# 輸出檔案名稱:pca_dual_output.png(與附帶的參考圖片相符)
# --------------------------
plt.savefig("pca_dual_output.png", dpi=300, bbox_inches="tight")
print("\nFigure saved successfully as 'pca_dual_output.png' in the current folder!")##
## Figure saved successfully as 'pca_dual_output.png' in the current folder!
# ==============================================================================
# 最終數值驗證總結輸出
# ==============================================================================
print("\n" + "="*72)##
## ========================================================================
## VERIFICATION COMPLETE: Full PCA pipeline exactly replicates original SageMath workflow
## 1. Raw pairwise feature plot generated BEFORE all PCA mathematical calculations
## 2. Standardization uses identical nested row/column loops from reference Sage code
## 3. SVD decomposition, PC variance, and explained variance ratios fully cross-validated
## 4. Post-PCA dual figures rendered: Red scree variance line + red 2D PC scatter points
print("5. PC1 + PC2 capture ~85% of total dataset variance for effective 4D → 2D dimension reduction")## 5. PC1 + PC2 capture ~85% of total dataset variance for effective 4D → 2D dimension reduction
## ========================================================================