import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from numba import njit
import mplcyberpunk
plt.style.use("cyberpunk")

@njit
def sc_jit(ref, seq):
    '''moving window subsequence comparison counting matches for each position of seq on ref'''
    n = len(ref) - len(seq) + 1
    cv = np.zeros(n, dtype=np.int8) 
    for i in range(n):
        count = 0
        for j in range(len(seq)):
            if ref[i + j] == seq[j]:
                count += 1
        cv[i] = count
    return cv
sc_jit(np.random.randint(0, 3, 100), np.random.randint(0, 3, 10));    # call it once for numba to compile the function

def encoder(seq):
    '''map sequence values into new axis to encode interaction strengths'''
    array = np.zeros((len(seq),4))
    for i in range(len(array)):
        array[i, seq[i]] = np.sqrt((2+(seq[i]>1))) # A U G C stored as 1 2 3 4 get energies 2 2 3 3
    return array

def sc_np(a, b): 
    '''sequence comparison that does not count mismatches, but energy'''
    A = encoder(a)
    B = encoder(b)
    cv = np.correlate(A.flatten(), B.flatten(), mode = "valid")[::4]
    return cv

ref = np.random.randint(low = 0, high = 4, size = 10**6) # generate random reference sequence to test against
query = ref[:10]                                         # query = binding sequence to evaluate

energy_left  = -sc_np(ref, query)  # is the total binding energy of query and ref for each possible position, assuming the physical AU = 2kT and GC = 3kT.
energy_right = -2.5 * sc_jit(ref, query)  # a simplified energy calculation assuming 2.5kT for each matching base pair.

boltzmann_left  = np.exp(-energy_left)
boltzmann_right = np.exp(-energy_right)

# making plots #
fig, (ax, bx) = plt.subplots(1, 2, figsize=(10, 3))
ax.plot(boltzmann_left);     
bx.plot(boltzmann_right);  
ax.set_title("2kT for AU and 3kT for GC")
bx.set_title("ΔE = 2.5 kT per (mis)match")
ax.set_ylabel("non-normalized binding probability")
bx.set_ylabel("non-normalized binding probability")
ax.set_xlabel("position of query on reference sequence");
bx.set_xlabel("position of query on reference sequence");
plt.tight_layout()
# For comparison there is a perfect match a the start of the sequence

# theoretical binding probability i.e. Boltzmann distribution, trying to reconsitute the figure
energy = np.array([8, 3, 2, 1, 0]) # assuming the binding energy is roughly 1kT * number of mismatches

fig, (ax, bx) = plt.subplots(1, 2, figsize=(10, 3))
ax.barh(range(len(energy)), np.exp(-energy)) 
ax.set_yticks(range(len(energy)), energy)
ax.invert_yaxis()
ax.set_ylabel("number of mismatches")
ax.set_xlabel("binding frequency [au]")  # frequency in arbitrary units because this is only the boltzmann factor, whithout normalisation (e.g. partition function)
ax.set_title("binding frequencies predicted by boltzmann  \n distribution with mismatch energy ~ 1kT");


# lets see how specific it will bind to a random reference
ref = np.random.randint(low = 0, high = 4, size = 10**6) # generate random reference sequence to test against
query = ref[:10]                                         # query = binding sequence to evaluate

energy_right = - 1.0*sc_jit(ref, query)  # a simplified energy calculation assuming 1kT for each matching base pair.

boltzmann_right = np.exp(-energy_right)

# making plots #
bx.plot(boltzmann_right);  
bx.set_ylabel("non-normalized binding probability")
bx.set_xlabel("position of query on reference sequence");
plt.tight_layout()

def plot_transcriptom_affinity(query_length, mm_energy):
    # lets generate a random query sequence and a random transcriptome
    ref = np.random.randint(low = 0, high = 4, size = 10**7) # random 10 Mbp transcriptom 
    query = np.random.randint(low = 0, high = 4, size = query_length) # query sequence is 12bp long
    # sc_jit is a SequenceComparison function returning the number of matches at each position.
    miss_matches = (len(query) - sc_jit(ref, query)) # number of nucleotide mismatches of query binding at each position on reference
    
    
    fig, (ax, bx, cx) = plt.subplots(1, 3, figsize = (14, 3))
    
    ## Lets plot the how often binding sites with x mismatches occur in the random 10Mbp transcriptome
    mm_count, bin_edges = np.histogram(miss_matches, bins = len(query)+1, range = (-0.5, len(query)+0.5), density = True)
    ax.invert_xaxis()
    MMx = (bin_edges[1:] + bin_edges[:-1])/2  # get the center points of the hist bins
    ax.bar(MMx, mm_count, label = "simulated")
    ax.set_title("frequency of binding sites in random transcriptome") 
    ax.set_xlabel("number of mismatches");
    ax.plot(MMx, stats.binom.pmf(MMx, len(query), 3/4), label = f"binom(n={len(query)}, p = 3/4)", color = "grey", marker = "o", lw = 0)
    ax.legend()

    # Lets plot the boltzmann distribution (again)
    bind_freq = np.exp(-MMx*mm_energy) # as said before the energy is the number of missmatches * 1 for RBP-RNA binding, using bin_edges here to get the mis match values used in plotted in the diagrams
    bx.bar(MMx, bind_freq)
    bx.invert_xaxis()
    bx.set_title(f"Boltzmann factor using {mm_energy}kT") # in arbitrary units since its just the boltzmann factor, without normalisation
    bx.set_xlabel("number of mismatches");

    # Lets multiply the first two plots: overall binding affinity = number of sites * affinity to individual site
    cx.bar(MMx, bind_freq*mm_count*len(ref), label =" off-target binding")
    cx.invert_xaxis()
    cx.set_title("frequency of interactions")
    cx.set_xlabel("number of mismatches");
    cx.bar(0, bind_freq, zorder = -1, label ="correct target site");
    
    # because I dont have the partitionsum, the boltzmann factor alone is only proportional to the probability, but its not normalized. Therefore everything is in arbitrary units
    ax.set_yticks([])
    bx.set_yticks([])
    cx.set_yticks([])
    cx.legend()
    plt.tight_layout();
    
plot_transcriptom_affinity(12, 1)

### increasing interaction strength ###
plot_transcriptom_affinity(12, 2.5)

### Increasing sequence length ###
plot_transcriptom_affinity(24, 1)

### increasing both interaction strength and binding sequence length ###
plot_transcriptom_affinity(24, 2.5)

RNA-RNA binding - Boltzmann probabilities from hydrogen bond energies¶

RNA Pumby binding energy from binding frequencies¶

One specific vs many unspecific binding sites¶

Limitations of this model¶