Decomposition Simulation
The following is a annotated part of the source code we used for the simulation of the decomposition to answer the questions how many trials would be required to achieve a reliable decomposition.
def get_score(data):
if data.shape[1] > 1:
# if we use at least two trials for PCA, we flip the trials to
# make sure the first component aligns in direction
flipper = orient_principal(data)
data = flipper * data
# and calculate the score (s)
c, s, v = pca_largest(data)
else: # if we have only one trial, this is the first component
s = data
# the sign of a score/coefficient pair is arbitrary
# if the positive peak in the score occurs later than the negative
# peak, we invert the score to align all scores across all bootstrap
# repetitions
if np.argmax(s[:, 0]) > np.argmin(s[:, 0]):
return -s[:, 0]
else:
return s[:, 0]
# assuming that data is the [samples x trials] matrix of all trials across
# all subjects, we remembert the maximal number of trials NT and define
# the range of trials picked over which we want to explore the simulation
NT = data.shape[1]
ratio = np.hstack(
(
np.arange(2, 10, 1, dtype=int),
np.arange(10, 100, 10, dtype=int),
np.arange(100, 1000, 100, dtype=int),
)
)
As = []
for pratio in ratio:
# We balanced the number of repetitions, i.e. we run more repetitions
# for small sample sizes to increase the accuracy and less repetitions
# for high sample sizes to reduce computational cost.
# Additionally, the maximal number of sample sizes is bound to the
# maximally available trials in the dataset.
reps = int(NT / pratio)
with tqdm(
total=reps, desc=f"Bootstrap PCA reliability {pratio}"
) as pbar:
A = []
for rep in range(reps):
shuffled = np.random.choice(range(NT), NT)
# train the first score on the partial dataset
pick = shuffled[0:pratio]
s1 = get_score(np.take(data, pick, axis=1))
# train the second score on another part of the dataset
pick = shuffled[pratio : 2 * pratio]
s2 = get_score(np.take(data, pick, axis=1).copy())
# calculate their correlation coefficient
a, _ = pearsonr(s1, s2)
A.append(a)
pbar.update(1)
# average the correlation coefficient under transformation
# to account for the bounds and non-normality
As.append(np.tanh(np.arctanh(A).mean()))
fig, ax = plt.subplots(1, 1, figsize=(6, 4), dpi=300)
ax.semilogx(ratio, As, "k")
ax.set_ylabel("Waveform Reproducibility")
ax.set_xlabel("Available Trials")
ax.grid()