Demystifying multichannel Singular Spectrum Analysis (mSSA)¶
Mike Petersen, 16 May 2022¶
updated 19 September 2024¶
A visual approach to the linear algebra steps of mSSA. This tutorial follows Weinberg & Petersen (2021) -- hereafter WP21 -- with some modest changes that appeared in Johnson et al. (2023), and some things that I've learned since then.
With 12 worked examples.
Goals:
- Use numpy to perform linear algebra steps
- Run pedagogical examples
- Build a playground for intuition development
- Let you loose on (small) mSSA problems!
Outline:
- As mSSA is complicated, start with SSA, where we consider a single input stream.
- Develop a range of examples to showcase different use cases for SSA.
- Move to an mSSA implementation.
- Look ahead to what else can be done with mSSA.
# standard python modules
import numpy as np
import time
import copy
# plotting utilities
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib inline
cmap = mpl.cm.inferno
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['font.weight'] = 'medium'
mpl.rcParams['axes.linewidth'] = 1.5
mpl.rcParams['xtick.major.width'] = 1.5
mpl.rcParams['xtick.minor.width'] = 0.75
mpl.rcParams['xtick.minor.visible'] = True
mpl.rcParams['ytick.major.width'] = 1.5
mpl.rcParams['ytick.minor.width'] = 0.75
mpl.rcParams['ytick.minor.visible'] = True
Example 0¶
Let's start with a very simple case: a single sinusoid.
ndim = 1 # as this is SSA, we only have one dimension of data
N = 200 # set the length of the data
data = np.zeros([ndim,N])
data[0] = np.cos(np.arange(0,N,1)*np.pi/12.)
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(data[0],color='black')
plt.title('input data series')
plt.ylabel('amplitude')
_ = plt.xlabel('sample number (N)')
plt.tight_layout()
The first ingredient is some prescription to embed the data. At this step, we are creating a trajectory matrix that is comprised of lagged copies of the input data stream. This constructs a matrix where the antidiagonals have the same value. We will call this matrix $T$, following WP21, equation 10.
def embed_data(data,L,K,ndim=1,norm=False):
"""
Construct an embedded time series (trajectory matrix) based on the input data.
This function embeds the input time series following the strategy outlined by Weinberg & Petersen (2021). The trajectory matrix `Y` is formed by arranging augmented vectors as rows, with an option to normalize each data stream by removing the mean and scaling by the standard deviation.
Parameters
----------
data : (array-like) The input time series data streams. Each data stream should be structured as data[n], where `n` is the index for each stream.
L : (int) The window length, which determines the number of rows in the trajectory matrix.
K : (int) The length of the augmented vectors (number of columns per data stream).
ndim : (int, optional) The number of data streams to include in the embedding process. Default is 1.
norm : (bool, optional) If True, each data stream is normalized by removing the mean and dividing by the standard deviation. If False, the raw data values are used. Default is False.
Returns
-------
Y : (ndarray) The trajectory matrix of shape (L, K * ndim), where each row represents an augmented vector from the input data streams.
Notes
-----
- If `norm=True`, the data are standardized (mean-subtracted and divided by the standard deviation) to remove trends and variations in scale.
"""
Y = np.zeros([L,K*ndim])
for n in range(0,ndim):
for j in range(0,K):
for i in range(0,L):
if norm:
Y[i, K*n+j] = (data[n][i + j] - np.nanmean(data[n]))/np.nanstd(data[n])
else:
Y[i, K*n+j] = data[n][i + j]
print('Shape of Y:',Y.shape)
return Y
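The triple loop above is written to make the indexing explicit, but it is slow for long series. A sliced version is equivalent (a sketch; the name embed_data_fast is just for illustration, and it keeps the same (L, K*ndim) layout while skipping the norm option):
def embed_data_fast(data, L, K, ndim=1):
    """Sliced equivalent of embed_data (no normalization): column K*n + j holds data[n][j:j+L]."""
    Y = np.zeros([L, K*ndim])
    for n in range(ndim):
        for j in range(K):
            Y[:, K*n + j] = data[n][j:j+L]
    print('Shape of Y:', Y.shape)
    return Y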
L = 60 # set the window length
K = N-L+1 # calculate the length of the lagged series
#embed the data
T = embed_data(data,L,K,1)
plt.figure(figsize=(4,3),facecolor='white')
plt.imshow(T,cmap=cm.coolwarm,aspect='equal')
plt.title('trajectory matrix')
plt.colorbar(label='amplitude')
plt.ylabel('rows are windowed (length L)')
_ = plt.xlabel('columns are lagged vectors (length K)')
Shape of Y: (60, 141)
The second step is to construct the lag-covariance matrix. We have two choices here:
- Construct the full version, $C = \frac{1}{K}T^T\cdot T$, where the matrix is $K\times K$.
- Construct the reduced version, $C = \frac{1}{K}T\cdot T^T$, where the matrix is $L\times L$.
We will choose option 1 here, but note the differences below. This is equation 11 in WP21, and is selected in this code with full=True. Throughout this notebook, we can set full=False to get option 2.
def make_covariance(T,K,full=True):
"""
Estimate the covariance matrix for a given trajectory matrix.
This function computes the covariance matrix C_X based on the provided trajectory matrix using the formula in Equation 9. The output can be either the full covariance matrix or a reduced version, depending on the `full` parameter.
Parameters
----------
T : (array-like) The trajectory matrix, representing time series data with delayed coordinates.
K : (int) The length of the augmented vectors, which corresponds to the number of columns in the trajectory matrix.
full : (bool, optional) If True, computes the full covariance matrix as T^T * T / K. If False, computes a reduced covariance matrix as T * T^T / K. Default is True.
Returns
-------
    C : (ndarray) The covariance matrix of shape (K, K) if `full=True`, or of shape (M, M), where M is the number of rows in T, if `full=False`.
"""
if full:
C = np.dot(T.T,T)/K
else:
C = np.dot(T,T.T)/K
print('C shape:',C.shape)
return C
C = make_covariance(T,K)
plt.figure(figsize=(4,3),facecolor='white')
plt.imshow(C,cmap=cm.coolwarm)
plt.title('covariance matrix')
plt.colorbar(label='amplitude')
plt.ylabel('dim 1 (length K)')
_ = plt.xlabel('dim 2 (length K)')
C shape: (141, 141)
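Before moving on, it is worth convincing ourselves that the two options really do carry the same information: the nonzero eigenvalues of $T^T\cdot T$ and $T\cdot T^T$ coincide. A quick check (a sketch reusing the functions above; the names C_red, sv_full, and sv_red are invented here):
# sanity check: the full (K x K) and reduced (L x L) covariance matrices
# share their leading eigenvalues, so either choice recovers the same structure
C_red = make_covariance(T,K,full=False)
sv_full = np.linalg.svd(C, compute_uv=False)
sv_red = np.linalg.svd(C_red, compute_uv=False)
print('leading eigenvalues agree:', np.allclose(sv_full[:5], sv_red[:5]))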
Now we get to the magic: making the principal components, singular vectors, and empirical orthogonal functions. This is accomplished by a singular value decomposition step.
Following equation 12 of WP21, use SVD: $$C = U\cdot \Lambda \cdot V.$$ SVD takes $C$ as input and returns $U$, $\Lambda$, and $V$. The columns of $U$ are the eigenvectors (or empirical orthogonal functions [EOFs], each of length $K$), which we immediately use to construct the principal components $P$ (each of length $L$) by projecting the trajectory matrix onto the EOFs: $$P = T\cdot U.$$
Other notes:
- The principal components are in lag space, and cannot be directly interpreted as series time. Instead, they encode the time of maximum correlated amplitude, but these only have meaning on the input timescale after the reconstruction step.
- The principal components will most often come in pairs when periodic signals are present in the dataset, in order to reconstruct phase information.
- $\Lambda$ are the singular values. These are typically ordered largest to smallest by SVD algorithms.
- $U = V^T$, to the numerical precision of the SVD (because $C$ is symmetric).
- One can think of this step as hunting for structure in the covariance matrix.
def make_pcs(C,T,ndim=1,full=True):
"""perform the SVD on the covariance matrix to construct the PCs.
inputs
---------
C : (array) the covariance matrix
Y : (array) the trajectory matrix
ndim : (int, default 1) the number of MSSA dimensions
returns
---------
PCs : (array) the principal components (in rows)
U : (array)
SV : (vector) the list of singular vectors
"""
U,SV,V = np.linalg.svd(C, full_matrices=True)
# the columns of U are the left singular vectors
# the columns of V are the right singular vectors
# SV are the singular values
# to recover the PCs, dot the trajectory matrix and the left singular vectors
# the projection of the time series Y onto the EOF (eigenvectors), U
if full:
PCs = np.dot(T,U)
else:
PCs = np.dot(V,T).T
print('PC shape:',PCs.shape)
if full:
print('Left singular vector shape:',U.shape)
return PCs,U,SV
else:
print('Left singular vector shape:',V.shape)
return PCs,V.T,SV
# run it!
PC,EOF,SV = make_pcs(C,T)
PC shape: (60, 141) Left singular vector shape: (141, 141)
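Two quick checks of the notes above, using the arrays we just computed (a minimal sketch):
# the columns of EOF are orthonormal, and the singular values arrive sorted
print('EOFs orthonormal:', np.allclose(np.dot(EOF.T,EOF), np.eye(EOF.shape[1])))
print('singular values sorted largest to smallest:', np.all(np.diff(SV) <= 0))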
# the columns of EOFs are the set of empirical orthogonal vectors: plot them!
plt.figure(figsize=(4,3),facecolor='white')
for i in range(K-1,-1,-1):
plt.plot(EOF[:,i],color=cm.viridis(i/9.))
plt.title('empirical orthogonal functions')
plt.ylabel('EOF value')
_ = plt.xlabel('zero origin time (length K)')
# the columns of PCs are the set of principal components: plot them!
plt.figure(figsize=(4,3),facecolor='white')
for i in range(9,-1,-1):
plt.plot(PC[:,i],color=cm.viridis(i/9.))
plt.title('principal components')
plt.ylabel('PC value')
_ = plt.xlabel('abscissa (length L)')
# plot the run of singular values
plt.figure(figsize=(4,3),facecolor='white')
SVnonoise = np.copy(SV) # save these for later...
plt.plot(np.log10(SV),color='black')
plt.title('singular value curve')
plt.ylabel('log$_{10}$ singular value')
_ = plt.xlabel('PC number (length K)')
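A common diagnostic to attach to this curve (standard SSA practice rather than anything specific to WP21, so treat the normalisation as an assumption): because $C$ is symmetric, its singular values are its eigenvalues, and each one measures the lagged variance associated with its component.
# fraction of the total (lagged) variance associated with each component
varfrac = SV/np.sum(SV)
print('variance fraction of the first pair: {:.4f}'.format(np.sum(varfrac[:2])))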
We aren't quite done yet: we want to reconstruct the contribution to the input time series from each PC. This is accomplished through an antidiagonal average of the transformed PCs. In WP21, we explicitly write the element reconstruction; in this implementation we will play a numpy trick to accomplish the antidiagonal average:
- Select the PC to use for reconstruction, $P$, and the corresponding EOF, $E$. Both $P$ and $E$ are single vectors of length $K$. $E$ is a column of $U$.
- Calculate $A = P^T\cdot E$.
- Compute the antidiagonal average of $A$ by reversing the matrix and using numpy's built-in .diagonal call to compute the average.
def reconstruct(PC,EOF,pcnum,K,nd=0):
"""Average antidiagonal elements of a 2d array
inputs
-----------
PC : (array) PCs
EOF : (array) empirical orthogonal functions
pcnum : (int) which PC to consider
K : (int) length of the augmented vectors
    nd : (int, default 0) which mSSA dimension (data stream) to consider
returns
-------
    x1d : (np.array) 1d numpy array representing averaged antidiagonal elements of A
thanks to: https://codereview.stackexchange.com/questions/195635/numpy-2d-array-anti-diagonal-averaging
"""
A = np.dot(np.array([PC[:,pcnum]]).T,np.array([EOF[K*nd:K*(nd+1),pcnum]]))
x1d = [np.mean(A[::-1, :].diagonal(i)) for i in range(-A.shape[0] + 1, A.shape[1])]
return np.array(x1d)
O = reconstruct(PC,EOF,0,K)
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(O,color='black')
plt.plot(data[0],color='grey',linestyle='dashed')
plt.title('first PC reconstruction')
plt.xlabel('sample number (length $N$)')
plt.ylabel('amplitude')
plt.tight_layout()
O = reconstruct(PC,EOF,0,K)
O += reconstruct(PC,EOF,1,K) # this adds PC1
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(O,color='black')
plt.plot(data[0],color='grey',linestyle='dashed')
plt.title('first PC $group$ reconstruction')
plt.xlabel('sample number (length $N$)')
plt.ylabel('amplitude')
That's it! SSA in a nutshell. We followed four steps:
- embed the data
- form the lag-covariance matrix
- decompose the covariance matrix
- reconstruct the decomposition (selectively!)
The final step, reconstruction, is where much interpretation creeps in, and is the primary subject of automation research in the group.
Extensions:
- How do the results change with window length? (See the sketch below.)
- Should we detrend (and are there other schemes)? Under what circumstances?
- How much can we stride the data (subsample) and still maintain robust results?
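As a starting point for the first extension, here is a minimal sketch that reruns the whole pipeline for a few (arbitrarily chosen) window lengths and compares the leading-pair reconstruction against the clean input from Example 0:
# vary the window length and check the leading-pair reconstruction
for Ltest in (20, 60, 100):
    Ktest = N - Ltest + 1
    Ttest = embed_data(data, Ltest, Ktest, 1)
    Ctest = make_covariance(Ttest, Ktest)
    PCt, EOFt, SVt = make_pcs(Ctest, Ttest)
    Orec = reconstruct(PCt, EOFt, 0, Ktest) + reconstruct(PCt, EOFt, 1, Ktest)
    print('L={:3d}: max |reconstruction - input| = {:.2e}'.format(Ltest, np.max(np.abs(Orec - data[0]))))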
Example 1¶
Let's go through another example to showcase one of mSSA's strengths: removing noise.
This time, we'll construct a noisy data stream by injecting white noise (important distinction!) onto the sinusoidal series.
ndim = 1
N = 200
L = 70
K = N-L+1
data = np.zeros([ndim,N])
data[0] = np.cos(np.arange(0,N,1)*np.pi/12.) # start with a simple sinusoid
cleandata = np.copy(data[0]) # record the clean data
# add some gaussian noise, at the level of the signal!
noiselevel = 1.5
data[0] += np.random.normal(0.0,noiselevel,size=data[0].size)
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(data[0],color='black')
plt.plot(cleandata,color='gray',linestyle='dashed')
plt.title('input data series');plt.ylabel('amplitude');plt.xlabel('sample number (N)')
plt.tight_layout()
# perform the SSA steps
T = embed_data(data,L,K,1)
C = make_covariance(T,K)
PC,EOF,SV = make_pcs(C,T)
O = reconstruct(PC,EOF,0,K,0)
O += reconstruct(PC,EOF,1,K,0) # add the second member of the pair
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(O,color='black')
plt.plot(cleandata,color='gray',linestyle='dashed')
#plt.plot(data[0],color='red',linestyle='dashed')
plt.title('first PC $group$ reconstruction');plt.xlabel('sample number (length $N$)');plt.ylabel('amplitude');plt.tight_layout()
Shape of Y: (70, 131) C shape: (131, 131) PC shape: (70, 131) Left singular vector shape: (131, 131)
We can also compare to the Discrete Fourier Transform (DFT) to see some of the real interpretive power of SSA: SSA makes the signal stand out at high significance.
def make_dft(data):
freqs = np.fft.fftfreq(len(data), d=1.0)
DFT = np.fft.fft(data, axis=-1)
nfreqs = int(np.floor(len(data)/2.))
return freqs[0:nfreqs],np.abs(DFT)[0:nfreqs]
ff,ftD = make_dft(data[0])
ff,ftR = make_dft(O)
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(ff,ftD,color='gray',linestyle='dashed')
plt.plot(ff,ftR,color='black')
plt.title('DFT frequency recovery');plt.xlabel('frequency (inverse time)'); plt.ylabel('amplitude')
O = reconstruct(PC,EOF,2,K,0)
for pc in range(3,45):
O += reconstruct(PC,EOF,pc,K,0)
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(O,color='black')
plt.plot(cleandata,color='gray',linestyle='dashed')
plt.title('second PC $group$ reconstruction')
plt.xlabel('sample number (length $N$)')
plt.ylabel('amplitude')
We've shown that we can separate the signal from the noise very efficiently using SSA.
Extensions:
- What happens when you test $L$ and $N$?
- What does a pure noise field look like on reconstruction?
- How large can the noise level be before we cannot recover the signal?
- How does biased noise affect the results?
# check in on the SV curve
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(np.log10(SV),color='black')
plt.plot(np.log10(SVnonoise),color='gray',linestyle='dashed')
plt.title('singular value curve')
plt.ylabel('log$_{10}$ singular value')
_ = plt.xlabel('PC number (length K)')
Example 2¶
What about an example with a second component: slow growth over time? Can we reconstruct that?
Bonus: how does separation of signal change with window length (try adjusting $L$!)?
We'll keep noise for this example.
ndim = 1
N = 200
L = 50
K = N-L+1
data = np.zeros([ndim,N])
data[0] = np.cos(np.arange(0,N,1)*np.pi/12.) # start with a simple sinusoid
plt.figure(figsize=(4,3))#,facecolor='white')
plt.plot(data[0],color='grey',linestyle='dotted',label='periodic trend')
# add a quadratic (slow) trend
quadratictrend = 0.0001*(np.arange(0,N,1)**2.)
data[0] += quadratictrend
plt.plot(quadratictrend,color='grey',linestyle='dashed',label='growing trend')
cleandata = np.copy(data[0]) # record the clean data
# add some gaussian noise, at the level of the signal!
noiselevel = 1.0
data[0] += np.random.normal(0.,noiselevel,size=data[0].size)
plt.plot(data[0],color='black',label='measured signal')
plt.ylabel('amplitude');_ = plt.xlabel('sample number (N)');plt.legend(frameon=False);plt.tight_layout()
# perform the SSA steps
T = embed_data(data,L,K,1)
C = make_covariance(T,K)
PC,EOF,SV = make_pcs(C,T)
plt.figure(figsize=(4,3))#,facecolor='white')
plt.plot(data[0],color='black',label='measured signal')
O = reconstruct(PC,EOF,0,K,0)
plt.plot(O,color='red',linestyle='dashed',label='first group')
O = reconstruct(PC,EOF,1,K,0)
O += reconstruct(PC,EOF,2,K,0)
plt.plot(O,color='red',linestyle='dotted',label='second group')
O = reconstruct(PC,EOF,3,K,0)
for k in range(4,50):
O += reconstruct(PC,EOF,k,K,0)
#plt.plot(O,color='red',linestyle='solid',lw=0.5,label='nullity')
plt.xlabel('sample number (length $N$)');plt.ylabel('amplitude');plt.legend(frameon=False);plt.tight_layout()
Shape of Y: (50, 151) C shape: (151, 151) PC shape: (50, 151) Left singular vector shape: (151, 151)
plt.figure(figsize=(4,3))#,facecolor='white')
plt.plot(data[0],color='black',label='measured signal')
O = reconstruct(PC,EOF,3,K,0)
for k in range(4,50):
O += reconstruct(PC,EOF,k,K,0)
plt.plot(O,color='red',linestyle='solid',lw=0.5,label='nullity')
plt.xlabel('sample number (length $N$)');plt.ylabel('amplitude');plt.legend(frameon=False);plt.tight_layout()
# now compare to the original series
plt.figure(figsize=(4,3))#,facecolor='white')
O = reconstruct(PC,EOF,0,K,0)
plt.plot(O,color='red',linestyle='dashed',label='first group')
O = reconstruct(PC,EOF,1,K,0)
O += reconstruct(PC,EOF,2,K,0)
plt.plot(O,color='red',linestyle='dotted',label='second group')
data = np.zeros([ndim,N])
data[0] = np.cos(np.arange(0,N,1)*np.pi/12.) # start with a simple sinusoid
plt.plot(data[0],color='grey',linestyle='dotted',label='periodic trend')
# add the quadratic (slow) trend
data[0] += quadratictrend
plt.plot(0.0001*(np.arange(0,N,1)**2.),color='grey',linestyle='dashed',label='growing trend')
plt.xlabel('sample number (length $N$)');plt.ylabel('amplitude');plt.legend(frameon=False);plt.tight_layout()
# reconstruct the second group
O = reconstruct(PC,EOF,1,K,0)
O += reconstruct(PC,EOF,2,K,0)
# check on the DFT recovery
plt.figure(figsize=(4,3),facecolor='white')
ff,ft = make_dft(data[0])
plt.plot(ff,ft,color='gray',linestyle='dashed')
ff,ft = make_dft(O)
plt.plot(ff,ft,color='black')
plt.title('DFT frequency recovery');plt.xlabel('frequency (inverse time)');plt.ylabel('amplitude')
This is a good time to introduce a tool for interpretation: the $w$-correlation matrix. We won't go into the calculation details -- see WP21 eq. 19 -- but the $w$-correlation matrix looks for similarity between PCs, which often indicates that they form a group.
def wCorr(R):
"""
make the w-correlation matrix.
inputs
--------
R : (array) the reconstructed elements, stacked.
returns
-----------
wcorr : (array) the w-correlation matrix
"""
numT = R.shape[0]
numW = R.shape[1]
Lstar = np.nanmin([numT - numW, numW]);
Kstar = np.nanmax([numT - numW, numW]);
wcorr = np.zeros([numW, numW])
for m in range(0,numW):
for n in range(m,numW):
for i in range(0,numT):
if (i < Lstar): w = i;
elif (i < Kstar): w = Lstar;
else : w = numT - i + 1
wcorr[m, n] += w * R[i, m]*R[i, n]
#// Normalize
for m in range(0,numW):
for n in range(m+1,numW):
if (wcorr[m, m]>0.0 and wcorr[n, n]>0.0):
wcorr[m, n] /= np.sqrt(wcorr[m, m]*wcorr[n, n])
#// Unit diagonal
for m in range(0,numW):
wcorr[m, m] = 1.0
#// Complete
for m in range(0,numW):
for n in range(0,m):
wcorr[m, n] = wcorr[n, m]
return wcorr
# select a maximum PC to consider
pcmax = 10
# build a reconstruction matrix (PC contributions stacked on top of one another)
RC = np.zeros([N,pcmax])
for i in range(0,pcmax):
RC[:,i] = reconstruct(PC,EOF,i,K,nd=0)
# compute the w-correlation matrix
R = wCorr(RC)
plt.figure(figsize=(4,3),facecolor='white')
plt.imshow(np.log10(np.abs(R)),cmap=cm.Greys)
plt.colorbar(label='log correlation (higher is more correlated)')
plt.xlabel('PC number')
_ = plt.ylabel('PC number')
Note that the $w$-correlation matrix doesn't tell us about significance -- just which PCs are likely to be related.
We can also look at $F$ and $G$ matrices, but those will be described elsewhere.
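As an illustration only (this heuristic is an assumption, not part of WP21), one could turn the $w$-correlation matrix into candidate groups by thresholding it:
# group PCs whose w-correlation exceeds a (hand-picked) threshold
threshold = 0.5
groups, assigned = [], set()
for m in range(pcmax):
    if m in assigned:
        continue
    group = [m] + [n for n in range(m+1, pcmax) if abs(R[m, n]) > threshold and n not in assigned]
    assigned.update(group)
    groups.append(group)
print('candidate PC groups:', groups)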
Example 3 (optional)¶
What about two oscillatory signals on top of each other? (The second is a chirp, whose frequency increases over time.)
No noise in this example.
ndim = 1
N = 200
L = 50
K = N-L+1
data = np.zeros([ndim,N])
data[0] = np.cos(np.arange(0,N,1)*np.pi/12.) # start with a simple sinusoid
sample1data = np.copy(data[0]) # record the first signal
# add a chirp
omega2 = 0.04; chirp = 6.
omega = omega2*(1.0 + chirp*np.arange(0,N,1)/N);
data[0] += 0.5*np.cos(np.arange(0,N,1)*omega)
sample2data = 0.5*np.cos(np.arange(0,N,1)*omega) # record the second signal
cleandata = np.copy(data[0]) # record the clean data
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(data[0],color='black')
plt.plot(sample1data,color='gray',lw=1.)
plt.plot(sample2data,color='gray',lw=1.)
plt.title('input data series')
plt.ylabel('amplitude')
_ = plt.xlabel('sample number (N)')
# perform the SSA steps
T = embed_data(data,L,K,1)
C = make_covariance(T,K)
PC,EOF,SV = make_pcs(C,T)
O = reconstruct(PC,EOF,0,K,0)
O += reconstruct(PC,EOF,1,K,0)
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(O,color='black')
plt.plot(sample1data,color='gray',linestyle='dashed')
plt.plot(data[0],color='red',linestyle='dashed')
plt.title('first PC $group$ reconstruction')
plt.xlabel('sample number (length $N$)')
plt.ylabel('amplitude')
Shape of Y: (50, 151) C shape: (151, 151) PC shape: (50, 151) Left singular vector shape: (151, 151)
O = reconstruct(PC,EOF,2,K,0)
O += reconstruct(PC,EOF,3,K,0)
O += reconstruct(PC,EOF,4,K,0)
O += reconstruct(PC,EOF,5,K,0)
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(O,color='black')
plt.plot(sample2data,color='gray',linestyle='dashed')
plt.plot(data[0],color='red',linestyle='dashed')
plt.title('second PC $group$ reconstruction')
plt.xlabel('sample number (length $N$)')
plt.ylabel('amplitude')
plt.figure(figsize=(4,3),facecolor='white')
# check on the DFT recovery
ff,ft = make_dft(data[0])
plt.plot(ff,ft,color='gray',linestyle='dashed')
ff,ft = make_dft(O)
plt.plot(ff,ft,color='black')
plt.title('DFT frequency recovery')
plt.xlabel('frequency (inverse time)')
plt.ylabel('amplitude')
# select a maximum PC to consider
pcmax = 10
# build a reconstruction matrix (PC contributions stacked on top of one another)
RC = np.zeros([N,pcmax])
for i in range(0,pcmax):
RC[:,i] = reconstruct(PC,EOF,i,K,nd=0)
# compute the w-correlation matrix
R = wCorr(RC)
plt.figure(figsize=(4,3),facecolor='white')
plt.imshow(np.log10(np.abs(R)),cmap=cm.Greys)
plt.colorbar(label='log correlation (higher is more correlated)')
plt.xlabel('PC number')
_ = plt.ylabel('PC number')
Extensions:
- What happens when the signals do not have different amplitudes?
- How can we get better separation of the signals?
Example 4 (optional)¶
An example of how SSA works: sawtooth reconstruction. This example demonstrates that when a linear recurrence relation doesn't exist for the signal being reconstructed, SSA will require many PCs to recover the signal.
ndim = 1
N = 100
L = 50
K = N-L+1
data = np.zeros([ndim,N])
data[0] = (np.arange(0,N,1)*0.01) % 0.3
cleandata = np.copy(data[0]) # record the clean data
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(data[0],color='black')
plt.title('input data series')
plt.ylabel('amplitude')
_ = plt.xlabel('sample number (N)')
# perform the SSA steps
T = embed_data(data,L,K,1)
C = make_covariance(T,K)
PC,EOF,SV = make_pcs(C,T)
plt.figure(figsize=(4,3),facecolor='white')
for i in range(0,20,2):
if i == 0:
O = reconstruct(PC,EOF,0,K,0)
O += reconstruct(PC,EOF,1,K,0)
else:
O += reconstruct(PC,EOF,i,K,0)
O += reconstruct(PC,EOF,i+1,K,0)
plt.plot(O,color=cm.viridis_r(i/9.))
plt.plot(data[0],color='red',linestyle='dashed')
plt.title('first PC $group$ reconstruction')
plt.xlabel('sample number (length $N$)')
plt.ylabel('amplitude')
Example 5¶
We haven't even discussed the M in MSSA yet: what can we do with multiple dimensions?
Luckily, we've set up the definitions above to be MSSA-aware, so we can just throw a couple of switches and perform MSSA. Let's start with the simplest case: duplicating the sine wave twice as the input.
ndim = 2 # now two data streams: this is mSSA
N = 200 # set the length of the data
data = np.zeros([ndim,N])
data[0] = np.cos(np.arange(0,N,1)*np.pi/12.)
data[1] = np.cos(np.arange(0,N,1)*np.pi/12.)
cleandata = np.copy(data[0])
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(data[0],color='black')
plt.title('input data series')
plt.ylabel('amplitude')
_ = plt.xlabel('sample number (N)')
L = 50 # set the window length
K = N-L+1 # calculate the length of the lagged series
#embed the data
T = embed_data(data,L,K,ndim)
plt.figure(figsize=(4,3),facecolor='white')
plt.imshow(T,cmap=cm.coolwarm)
plt.title('trajectory matrix')
plt.colorbar(label='amplitude')
plt.ylabel('rows are windowed (length L)')
_ = plt.xlabel('columns are lagged vectors (length K)')
Shape of Y: (50, 302)
# take a look at the covariance matrix: lots of block structure here!
C = make_covariance(T,K) # not aware of dimensions
plt.figure(figsize=(4,3),facecolor='white')
plt.imshow(C,cmap=cm.coolwarm)
plt.title('covariance matrix')
plt.colorbar(label='amplitude')
plt.ylabel('dim 1 (length K)')
_ = plt.xlabel('dim 2 (length K)')
C shape: (302, 302)
# perform the additional SSA steps
PC,EOF,SV = make_pcs(C,T) # not aware of dimensions
# pick the dimension to reconstruct
reconstruct_stream = 0
O = reconstruct(PC,EOF,0,K,reconstruct_stream)
O += reconstruct(PC,EOF,1,K,reconstruct_stream)
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(O,color='black')
plt.plot(cleandata,color='gray',linestyle='dashed')
plt.plot(data[0],color='red',linestyle='dashed')
plt.title('first PC $group$ reconstruction, stream 1')
plt.xlabel('sample number (length $N$)')
plt.ylabel('amplitude')
PC shape: (50, 302) Left singular vector shape: (302, 302)
This exercise is totally trivial, but it can remind us that we have compressed the information from both streams into a single pair of eigenvectors! This would be true no matter how many times we fed in the series in duplicate.
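A quick way to see that compression (a sketch using the singular values we just computed): only the leading pair should stand out, however many duplicate streams we feed in.
# with duplicated streams, the information still lives in a single pair
print('leading singular values:', SV[:4])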
Example 6¶
What about extracting the series from two noisy streams (each with uncorrelated white noise)?
ndim = 2 # two noisy data streams
N = 200 # set the length of the data
data = np.zeros([ndim,N])
data[0] = np.cos(np.arange(0,N,1)*np.pi/12.)
data[1] = np.cos(np.arange(0,N,1)*np.pi/12.)
cleandata = np.copy(data[0])
noiselevel = 2.5
data[0] += np.random.normal(0.,noiselevel,size=data[0].size)
data[1] += np.random.normal(0.,noiselevel,size=data[0].size)
plt.figure(figsize=(5,3))
plt.plot(data[0],color='black',lw=1.)
plt.plot(data[1],color='blue',lw=1.)
plt.title('input data series')
plt.ylabel('amplitude')
_ = plt.xlabel('sample number (N)')
L = 100 # set the window length
K = N-L+1 # calculate the length of the lagged series
T = embed_data(data,L,K,ndim)
C = make_covariance(T,K) # not aware of dimensions
PC,EOF,SV = make_pcs(C,T) # not aware of dimensions
# pick the dimension to reconstruct
reconstruct_stream = 1
O = reconstruct(PC,EOF,0,K,reconstruct_stream)
O += reconstruct(PC,EOF,1,K,reconstruct_stream)
plt.figure()
plt.plot(O,color='black')
plt.plot(data[reconstruct_stream],color='grey',linestyle='dashed')
plt.title('first PC $group$ reconstruction, stream 1')
plt.xlabel('sample number (length $N$)')
plt.ylabel('amplitude')
Shape of Y: (100, 202) C shape: (202, 202) PC shape: (100, 202) Left singular vector shape: (202, 202)
We are able to dig the (correct) signal out of a noise field with 2.5x the amplitude of the signal!
(Note that this is specific to this problem and should not be considered a general result!)
You are now ready to MSSA a bigger, wilder dataset, or to continue playing in this sandbox, using the code we developed above!
Example 7¶
How about some real EXP simulation data?
Let's try looking at some bar coefficients in the simulation we explored in WP21.
# bring in some processed data. columns are
# 0: time
# 1-2: m=1, n=0
# 3-4: m=2, n=0
# 5-6: m=1, n=1
# ...etc
indir = ''
prefix = 'processed/seriesm1m2.'
Data = np.genfromtxt(indir+prefix+'data')
Data[:,0] *= 0.004 # put in simulation times
ndim = 2 # two data streams (the m=2, n=0 coefficient pair)
N = Data.shape[0] # set the length of the data
data = np.zeros([ndim,N])
data[0] = Data[:,3]
data[1] = Data[:,4]
plt.figure(figsize=(5,3))
plt.plot(data[0],color='black',lw=1.)
plt.plot(data[1],color='blue',lw=1.)
plt.title('input data series')
plt.ylabel('amplitude')
_ = plt.xlabel('sample number (N)')
# only do the expensive steps once!!
L = 200 # set the window length
K = N-L+1 # calculate the length of the lagged series
T = embed_data(data,L,K,ndim)
C = make_covariance(T,K) # not aware of dimensions
PC,EOF,SV = make_pcs(C,T) # not aware of dimensions
# pick the dimension to reconstruct
reconstruct_stream = 1
O = reconstruct(PC,EOF,0,K,reconstruct_stream)
O += reconstruct(PC,EOF,1,K,reconstruct_stream)
plt.figure()
plt.plot(O,color='black')
plt.plot(data[reconstruct_stream],color='grey',linestyle='dashed')
plt.title('first PC $group$ reconstruction, stream 1')
plt.xlabel('sample number (length $N$)')
plt.ylabel('amplitude')
Example 8¶
What about the correlation of $m=1$ and $m=2$?
# bring in some processed data. columns are
# 0: time
# 1-2: m=1, n=0
# 3-4: m=2, n=0
# 5-6: m=1, n=1
# ...etc
indir = ''
prefix = 'processed/seriesm1m2.'
Data = np.genfromtxt(indir+prefix+'data')
Data[:,0] *= 0.004 # put in simulation times
ndim = 4 # four data streams (the m=1 and m=2, n=0 coefficient pairs)
N = Data.shape[0] # set the length of the data
stride = 1
inputlength = Data[::stride,1].size
data = np.zeros([ndim,inputlength])
data[0] = Data[::stride,1]
data[1] = Data[::stride,2]
data[2] = Data[::stride,3]
data[3] = Data[::stride,4]
plt.figure(figsize=(5,3))
plt.plot(data[0],color='black',lw=1.)
plt.plot(data[1],color='blue',lw=1.)
plt.title('input data series')
plt.ylabel('amplitude')
_ = plt.xlabel('sample number (N)')
# only do the expensive steps once!!
L = 30 # set the window length
K = N-L+1 # calculate the length of the lagged series
#K = inputlength-L+1
T = embed_data(data,L,K,ndim)
C = make_covariance(T,K) # not aware of dimensions
PC,EOF,SV = make_pcs(C,T) # not aware of dimensions
# pick the dimension to reconstruct
reconstruct_stream = 1
O = reconstruct(PC,EOF,0,K,reconstruct_stream)
O += reconstruct(PC,EOF,1,K,reconstruct_stream)
plt.figure()
plt.plot(O,color='black')
plt.plot(data[reconstruct_stream],color='grey',linestyle='dashed')
plt.title('first PC $group$ reconstruction, stream 1')
plt.xlabel('sample number (length $N$)')
plt.ylabel('amplitude')
# select a maximum PC to consider
pcmax = 10
# build a reconstruction matrix (PC contributions stacked on top of one another)
RC = np.zeros([N,pcmax])
#RC = np.zeros([inputlength,pcmax])
for i in range(0,pcmax):
RC[:,i] = reconstruct(PC,EOF,i,K,0)
# compute the w-correlation matrix
R = wCorr(RC)
plt.imshow(np.log10(np.abs(R)),cmap=cm.Greys)
plt.colorbar(label='log correlation (higher is more correlated)')
plt.xlabel('PC number')
_ = plt.ylabel('PC number')
Okay, that last one was painful -- we need production mSSA!
A more advanced interface is available in pyEXP.
Example 9: What about missing data?¶
There are a few strategies for missing data. The simplest is to ignore the gaps and let SSA try to find the connecting values. That's what we'll do in this example: take a sinusoid and remove some fraction of the data.
(One can also place dummy data and use the reconstruction to refine.)
N=200 # set the initial number of data points
R=100 # set the number of data points to remove
ndim=1
testrange = np.arange(0,N,1)
testdata = np.cos(np.arange(0,N,1)*np.pi/12.)
exclude = np.unique(np.random.randint(0,N,R)) # pick samples to remove
R = len(exclude)
data = np.zeros([ndim,N-R])
dtime = np.zeros(N-R)
i = 0
for v in range(0,N):
if v not in exclude:
data[0][i] = testdata[v]
dtime[i] = testrange[v]
i+=1
plt.plot(dtime,data[0],color='grey',linestyle='dashed')
#plt.plot(testrange,testdata,color='black')
# now perform SSA steps
N = len(data[0])
L = 100 # set the window length
K = N-L+1 # calculate the length of the lagged series
Y = embed_data(data,L,K,ndim)
C = make_covariance(Y,K) # not aware of dimensions
PC,EOF,SV = make_pcs(C,Y) # not aware of dimensions
Shape of Y: (100, 19) C shape: (19, 19) PC shape: (100, 19) Left singular vector shape: (19, 19)
# examine the reconstruction
O = reconstruct(PC,EOF,0,K,0)
O += reconstruct(PC,EOF,1,K,0)
plt.figure()
plt.plot(dtime,O,color='black')
plt.plot(testrange,testdata,color='grey',linestyle='dashed')
plt.title('first PC reconstruction')
plt.xlabel('sample number (length $N$)')
plt.ylabel('amplitude')
In the end, we can see that the reconstruction is able to pick up the period quite well, even with a large number of missing data samples!
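As mentioned above, one can also insert dummy values and let the reconstruction refine them. A minimal sketch of that alternative, using the helper functions defined earlier (the fill_gaps helper, its defaults, and the choice of window are all illustrative):
def fill_gaps(series, mask, L, niter=5, npairs=1):
    """Iteratively fill missing samples: start from zeros, reconstruct with the
    leading PC group, and replace only the missing samples on each pass.
    series : full-length array (values at masked points are treated as unknown)
    mask   : boolean array, True where data are missing
    """
    filled = np.copy(series)
    filled[mask] = 0.0
    for _ in range(niter):
        Kfill = len(filled) - L + 1
        Tfill = embed_data(np.array([filled]), L, Kfill, 1)
        Cfill = make_covariance(Tfill, Kfill)
        PCf, EOFf, SVf = make_pcs(Cfill, Tfill)
        Ofill = reconstruct(PCf, EOFf, 0, Kfill)
        for p in range(1, 2*npairs):
            Ofill += reconstruct(PCf, EOFf, p, Kfill)
        filled[mask] = Ofill[mask]  # refine only the missing samples
    return filled
# usage with the series above: rebuild the full-length signal across its gaps
gapmask = np.isin(np.arange(len(testdata)), exclude)
recovered = fill_gaps(np.copy(testdata), gapmask, L=40)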
Example 10: Nothing from nothing¶
One possible issue with mSSA is the correlation of noise into (supposed) signal. This can -- and will -- happen, because mSSA will pick up any correlation, even if it is just noise. This is still an area of ongoing research.
ndim = 1
N = 200
L = 70
K = N-L+1
# set up some gaussian noise
data = np.zeros([ndim,N])
noiselevel = 1.5
data[0] = np.random.normal(0.0,noiselevel,size=data[0].size)
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(data[0],color='black')
plt.title('input data series');plt.ylabel('amplitude');plt.xlabel('sample number (N)')
plt.tight_layout()
# perform the SSA steps
T = embed_data(data,L,K,1)
C = make_covariance(T,K)
PC,EOF,SV = make_pcs(C,T)
O = reconstruct(PC,EOF,0,K,0)
O += reconstruct(PC,EOF,1,K,0)
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(O,color='black')
plt.plot(data[0],color='grey',linestyle='dashed')
plt.title('first PC $group$ reconstruction')
plt.xlabel('sample number (length $N$)')
plt.ylabel('amplitude')
Shape of Y: (70, 131) C shape: (131, 131) PC shape: (70, 131) Left singular vector shape: (131, 131)
Two things are working in our favour to mitigate this concern:
- The spurious correlation tends to be at very high frequency (higher than any frequency in the system), which flags it as false (see the sketch below).
- When correlating multiple series, it is harder to make a false signal -- because it would have to be common noise in different time series.
But worth being careful!
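To probe the first point above, a quick sketch: look at where the spurious reconstruction puts its power in frequency, reusing make_dft from Example 1.
# DFT of the noise-only "reconstruction": check where the spurious power lands
ff, ftO = make_dft(O)
plt.figure(figsize=(4,3),facecolor='white')
plt.plot(ff, ftO, color='black')
plt.title('DFT of noise-only reconstruction')
plt.xlabel('frequency (inverse time)')
_ = plt.ylabel('amplitude')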
Example 11: Linear Recurrence Relations¶
mSSA is fundamentally built on the idea that the time series can be represented by some (as yet undiscovered) linear recurrence relation.
We'll follow the Recurrent SSA Forecasting (SSA-R) strategy from Kalantari et al. (2019). The idea is to represent the time series as a linear recurrence relation, i.e. $$ y_{i+d} = \sum_{k=1}^d a_k y_{i+d-k},$$ where $1\le i\le N-d$, $a_d\ne0$, and $d<N$. Taking $d=L-1$, the values $R = (a_1,\ldots,a_{L-1})$ are the LRR coefficients, of which there are $L-1$.
Using the empirical orthogonal functions (left singular vectors) as an orthonormal basis in the trajectory space, we can calculate the $R$ values as $$R = \frac{1}{1-v^2}\sum_i \pi_i U_i,$$ where $U_i$ is the first $L-1$ components of EOF $i$ in the chosen group, $\pi_i$ is its last component, and $v^2 = \sum_i \pi_i^2$ is the verticality coefficient (see Kalantari et al. 2019 for details).
We'll go all the way back to the first sinusoid example to see this in action.
ndim = 1 # as this is SSA, we only have one dimension of data
N = 200 # set the length of the data
L = 40 # set the window length
K = N-L+1 # calculate the length of the lagged series
full=False # note that the strategy only works with (full=True,Lw=K) or (full=False,Lw=L). We'll choose the latter because it is quicker.
data = np.zeros([ndim,N])
data[0] = np.cos(np.arange(0,N,1)*np.pi/12.)
Y = embed_data(data,L,K,1)
C = make_covariance(Y,K,full=full)
PC,EOF,SV = make_pcs(C,Y,full=full)
O = reconstruct(PC,EOF,0,K,0)
O += reconstruct(PC,EOF,1,K,0)
Shape of Y: (40, 161) C shape: (40, 40) PC shape: (161, 40) Left singular vector shape: (40, 40)
def recurrencecoefficients(P,I,L):
# try implementing the recursion strategy to get R
# the verticality coefficient of Lr (the linear space defined by the eigenanalysis)
nu2 = np.sum([P[-1,i]**2 for i in range(0,I)])
print(nu2)
R = np.sum([(1/(1-nu2))*P[-1,i]*P[0:(L),i] for i in range(0,I)],axis=0)
return R
# two components should be sufficient to recover the dynamics
I = 2
Lw = L
R = recurrencecoefficients(EOF,I,Lw)
plt.plot(R,color='black',label='minimum LR')
# what happens if we add another group?
I = 3
R = recurrencecoefficients(EOF,I,Lw)
plt.plot(R,color='red',label='noisy LR')
plt.legend(frameon=False);plt.xlabel('linear recurrence term (length $L$)');_=plt.ylabel('amplitude')
0.04737397212828026 0.06041202108558286
I = 2 # use the minimum LR
Lw = L
O = reconstruct(PC,EOF,0,K)
for i in range(1,I): O += reconstruct(PC,EOF,i,K)
tvals = np.arange(0,len(O),1)
# compute the recurrence coefficients
R = recurrencecoefficients(EOF,I,Lw)
# how many forecasting steps should we take?
h = 100
# take the first forecasting step
lrecent = O[-Lw:]
ttime = tvals[-1] + 1  # the first forecast lands one step beyond the last sample
Z = np.concatenate([O,np.array([np.sum(R*lrecent)])])
totalT = np.concatenate([tvals,np.array([ttime])])
for ih in range(1,h):
    lrecent = Z[-Lw:]
    ttime += 1
    Z = np.concatenate([Z,np.array([np.sum(R*lrecent)])])
    totalT = np.concatenate([totalT,np.array([ttime])])
plt.plot(totalT,Z,color='red',label='Extended Series')
plt.plot(tvals,O,color='black',label='Original Series')
plt.legend(frameon=False)
plt.xlabel('sample number')
_ = plt.ylabel('amplitude')
0.04737397212828026
Not bad! But maybe using the full covariance matrix would be better?
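A sketch of that comparison, following the note in the cell that chose full=False: the same strategy should work with the full covariance matrix, provided Lw=K. This only sets up the pieces; the forecasting loop above can then be rerun with these in place of R, O, and Lw.
# redo the decomposition with the full (K x K) covariance matrix
Cfull = make_covariance(Y, K, full=True)
PCfull, EOFfull, SVfull = make_pcs(Cfull, Y, full=True)
Rfull = recurrencecoefficients(EOFfull, 2, K)  # recurrence coefficients, now length K
Ofull = reconstruct(PCfull, EOFfull, 0, K) + reconstruct(PCfull, EOFfull, 1, K)
# ...then repeat the forecasting loop with R -> Rfull, O -> Ofull, Lw -> K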
A final question I'm thinking about.¶
Can we make some sort of uncertainty estimate on the shape of the PCs?
Thanks for trying out mSSA!¶
Feel free to get in touch with any recommendations, questions, or just general thoughts.