Constructing a co-occurrence matrix in python pandas

I know how to do this in R. But, is there any function in pandas that transforms a dataframe to an nxn co-occurrence matrix containing the counts of two aspects co-occurring.

For example a matrix df:

import pandas as pd

df = pd.DataFrame({'TFD' : ['AA', 'SL', 'BB', 'D0', 'Dk', 'FF'],
                    'Snack' : ['1', '0', '1', '1', '0', '0'],
                    'Trans' : ['1', '1', '1', '0', '0', '1'],
                    'Dop' : ['1', '0', '1', '0', '1', '1']}).set_index('TFD')

print df

>>> 
    Dop Snack Trans
TFD                
AA    1     1     1
SL    0     0     1
BB    1     1     1
D0    0     1     0
Dk    1     0     0
FF    1     0     1

[6 rows x 3 columns]

would yield:

    Dop Snack Trans

Dop   0     2     3
Snack 2     0     2
Trans 3     2     0

Since the matrix is mirrored on the diagonal I guess there would be a way to optimize code.

Contents hide

Answers:

Method 1

Method 2

Method 3

Method 4

Usage:

output

Answers:

Thank you for visiting the Q&A section on Magenaut. Please note that all the answers may not help you solve the issue immediately. So please treat them as advisements. If you found the post helpful (or not), leave a comment & I’ll get back to you as soon as possible.

Method 1

It’s a simple linear algebra, you multiply matrix with its transpose (your example contains strings, don’t forget to convert them to integer):

>>> df_asint = df.astype(int)
>>> coocc = df_asint.T.dot(df_asint)
>>> coocc
       Dop  Snack  Trans
Dop      4      2      3
Snack    2      3      2
Trans    3      2      4

if, as in R answer, you want to reset diagonal, you can use numpy’s fill_diagonal:

>>> import numpy as np
>>> np.fill_diagonal(coocc.values, 0)
>>> coocc
       Dop  Snack  Trans
Dop      0      2      3
Snack    2      0      2
Trans    3      2      0

Method 2

Demo in NumPy:

import numpy as np
np.random.seed(3) # for reproducibility

# Generate data: 5 labels, 10 examples, binary.
label_headers = 'Alice Bob Carol Dave Eve'.split(' ')
label_data = np.random.randint(0,2,(10,5)) # binary here but could be any integer.
print('labels:n{0}'.format(label_data))

# Compute cooccurrence matrix 
cooccurrence_matrix = np.dot(label_data.transpose(),label_data)
print('ncooccurrence_matrix:n{0}'.format(cooccurrence_matrix)) 

# Compute cooccurrence matrix in percentage
# FYI: http://stackoverflow.com/questions/19602187/numpy-divide-each-row-by-a-vector-element
#      http://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero/32106804#32106804
cooccurrence_matrix_diagonal = np.diagonal(cooccurrence_matrix)
with np.errstate(divide='ignore', invalid='ignore'):
    cooccurrence_matrix_percentage = np.nan_to_num(np.true_divide(cooccurrence_matrix, cooccurrence_matrix_diagonal[:, None]))
print('ncooccurrence_matrix_percentage:n{0}'.format(cooccurrence_matrix_percentage))

Output:

labels:
[[0 0 1 1 0]
 [0 0 1 1 1]
 [0 1 1 1 0]
 [1 1 0 0 0]
 [0 1 1 0 0]
 [0 1 0 0 0]
 [0 1 0 1 1]
 [0 1 0 0 1]
 [1 0 0 1 0]
 [1 0 1 1 1]]

cooccurrence_matrix:
[[3 1 1 2 1]
 [1 6 2 2 2]
 [1 2 5 4 2]
 [2 2 4 6 3]
 [1 2 2 3 4]]

cooccurrence_matrix_percentage:
[[ 1.          0.33333333  0.33333333  0.66666667  0.33333333]
 [ 0.16666667  1.          0.33333333  0.33333333  0.33333333]
 [ 0.2         0.4         1.          0.8         0.4       ]
 [ 0.33333333  0.33333333  0.66666667  1.          0.5       ]
 [ 0.25        0.5         0.5         0.75        1.        ]]

With a heatmap using matplotlib:

import numpy as np
np.random.seed(3) # for reproducibility

import matplotlib.pyplot as plt


def show_values(pc, fmt="%.2f", **kw):
    '''
    Heatmap with text in each cell with matplotlib's pyplot
    Source: http://stackoverflow.com/a/25074150/395857 
    By HYRY
    '''
    from itertools import izip
    pc.update_scalarmappable()
    ax = pc.get_axes()
    for p, color, value in izip(pc.get_paths(), pc.get_facecolors(), pc.get_array()):
        x, y = p.vertices[:-2, :].mean(0)
        if np.all(color[:3] > 0.5):
            color = (0.0, 0.0, 0.0)
        else:
            color = (1.0, 1.0, 1.0)
        ax.text(x, y, fmt % value, ha="center", va="center", color=color, **kw)

def cm2inch(*tupl):
    '''
    Specify figure size in centimeter in matplotlib
    Source: http://stackoverflow.com/a/22787457/395857
    By gns-ank
    '''
    inch = 2.54
    if type(tupl[0]) == tuple:
        return tuple(i/inch for i in tupl[0])
    else:
        return tuple(i/inch for i in tupl)

def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels):
    '''
    Inspired by:
    - http://stackoverflow.com/a/16124677/395857 
    - http://stackoverflow.com/a/25074150/395857
    '''

    # Plot it out
    fig, ax = plt.subplots()    
    c = ax.pcolor(AUC, edgecolors='k', linestyle= 'dashed', linewidths=0.2, cmap='RdBu', vmin=0.0, vmax=1.0)

    # put the major ticks at the middle of each cell
    ax.set_yticks(np.arange(AUC.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(AUC.shape[1]) + 0.5, minor=False)

    # set tick labels
    #ax.set_xticklabels(np.arange(1,AUC.shape[1]+1), minor=False)
    ax.set_xticklabels(xticklabels, minor=False)
    ax.set_yticklabels(yticklabels, minor=False)

    # set title and x/y labels
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)      

    # Remove last blank column
    plt.xlim( (0, AUC.shape[1]) )

    # Turn off all the ticks
    ax = plt.gca()    
    for t in ax.xaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    for t in ax.yaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False

    # Add color bar
    plt.colorbar(c)

    # Add text in each cell 
    show_values(c)

    # Proper orientation (origin at the top left instead of bottom left)
    ax.invert_yaxis()
    ax.xaxis.tick_top()

    # resize 
    fig = plt.gcf()
    fig.set_size_inches(cm2inch(40, 20))



def main():

    # Generate data: 5 labels, 10 examples, binary.
    label_headers = 'Alice Bob Carol Dave Eve'.split(' ')
    label_data = np.random.randint(0,2,(10,5)) # binary here but could be any integer.
    print('labels:n{0}'.format(label_data))

    # Compute cooccurrence matrix 
    cooccurrence_matrix = np.dot(label_data.transpose(),label_data)
    print('ncooccurrence_matrix:n{0}'.format(cooccurrence_matrix)) 

    # Compute cooccurrence matrix in percentage
    # FYI: http://stackoverflow.com/questions/19602187/numpy-divide-each-row-by-a-vector-element
    #      http://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero/32106804#32106804
    cooccurrence_matrix_diagonal = np.diagonal(cooccurrence_matrix)
    with np.errstate(divide='ignore', invalid='ignore'):
        cooccurrence_matrix_percentage = np.nan_to_num(np.true_divide(cooccurrence_matrix, cooccurrence_matrix_diagonal[:, None]))
    print('ncooccurrence_matrix_percentage:n{0}'.format(cooccurrence_matrix_percentage))

    # Add count in labels
    label_header_with_count = [ '{0} ({1})'.format(label_header, cooccurrence_matrix_diagonal[label_number]) for label_number, label_header in enumerate(label_headers)]  
    print('nlabel_header_with_count: {0}'.format(label_header_with_count))

    # Plotting
    x_axis_size = cooccurrence_matrix_percentage.shape[0]
    y_axis_size = cooccurrence_matrix_percentage.shape[1]
    title = "Co-occurrence matrixn"
    xlabel= ''#"Labels"
    ylabel= ''#"Labels"
    xticklabels = label_header_with_count
    yticklabels = label_header_with_count
    heatmap(cooccurrence_matrix_percentage, title, xlabel, ylabel, xticklabels, yticklabels)
    plt.savefig('image_output.png', dpi=300, format='png', bbox_inches='tight') # use format='svg' or 'pdf' for vectorial pictures
    #plt.show()


if __name__ == "__main__":
    main()
    #cProfile.run('main()') # if you want to do some profiling

(PS: a neat visualization of a co-occurrence matrix in D3.js.)

Method 3

In case that you have larger corpus and term-frequency matrix, using sparse matrix multiplication might be more efficient. I use the same trick of matrix multiplication refered to algo answer on this page.

import scipy.sparse as sp
X = sp.csr_matrix(df.astype(int).values) # convert dataframe to sparse matrix
Xc = X.T * X # multiply sparse matrix # 
Xc.setdiag(0) # reset diagonal
print(Xc.todense()) # to print co-occurence matrix in dense format

Xc here will be the co-occurence matrix in sparse csr format

Method 4

To further elaborate this question, If you want to construct co-occurrence matrix from sentences you can do this:

import numpy as np
import pandas as pd

def create_cooccurrence_matrix(sentences, window_size=2):
    """Create co occurrence matrix from given list of sentences.

    Returns:
    - vocabs: dictionary of word counts
    - co_occ_matrix_sparse: sparse co occurrence matrix

    Example:
    ===========
    sentences = ['I love nlp',    'I love to learn',
                 'nlp is future', 'nlp is cool']

    vocabs,co_occ = create_cooccurrence_matrix(sentences)

    df_co_occ  = pd.DataFrame(co_occ.todense(),
                              index=vocabs.keys(),
                              columns = vocabs.keys())

    df_co_occ = df_co_occ.sort_index()[sorted(vocabs.keys())]

    df_co_occ.style.applymap(lambda x: 'color: red' if x>0 else '')

    """
    import scipy
    import nltk

    vocabulary = {}
    data = []
    row = []
    col = []

    tokenizer = nltk.tokenize.word_tokenize

    for sentence in sentences:
        sentence = sentence.strip()
        tokens = [token for token in tokenizer(sentence) if token != u""]
        for pos, token in enumerate(tokens):
            i = vocabulary.setdefault(token, len(vocabulary))
            start = max(0, pos-window_size)
            end = min(len(tokens), pos+window_size+1)
            for pos2 in range(start, end):
                if pos2 == pos:
                    continue
                j = vocabulary.setdefault(tokens[pos2], len(vocabulary))
                data.append(1.)
                row.append(i)
                col.append(j)

    cooccurrence_matrix_sparse = scipy.sparse.coo_matrix((data, (row, col)))
    return vocabulary, cooccurrence_matrix_sparse

Usage:

sentences = ['I love nlp',    'I love to learn',
             'nlp is future', 'nlp is cool']

vocabs,co_occ = create_cooccurrence_matrix(sentences)

df_co_occ  = pd.DataFrame(co_occ.todense(),
                          index=vocabs.keys(),
                          columns = vocabs.keys())

df_co_occ = df_co_occ.sort_index()[sorted(vocabs.keys())]

df_co_occ.style.applymap(lambda x: 'color: red' if x>0 else '')

# If not in jupyter notebook, print(df_co_occ)

output

All methods was sourced from stackoverflow.com or stackexchange.com, is licensed under cc by-sa 2.5, cc by-sa 3.0 and cc by-sa 4.0

0 0 votes

Article Rating