Cosine Similarity, Mindmaps#

Author: J. Busse, 6/2021, 2022-04-19

Lizenz: public domain / CC 0

For further development by students in the course dsci-txt

To cite this program:

  • Busse 2021-06-16: Cosine Similarity, Mindmaps. IPYNB notebook, April 2022

import numpy as np
import pandas as pd

Global Parameters#

# path to files, incl. glob mask

path_to_files = "mm/*.mm"

# show intermediary results
# 0 none, 1 informative, 2 debug
verbosity = 2
def verbose(level, item):
    """display item only if level does not exceed the global verbosity"""
    if level <= verbosity:
        display(item)
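Note: display is provided by IPython inside Jupyter; outside a notebook it would additionally need from IPython.display import display. A quick usage sketch (messages made up):

verbose(1, "shown, since 1 <= verbosity")
verbose(3, "suppressed, since 3 > verbosity")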

Read Filenames#

# https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
import glob
files = glob.glob(path_to_files)
verbose(2,files)
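If the glob matches nothing, all later cells operate on an empty corpus and the tf-idf step below fails. A minimal guard, as a sketch:

# fail early if no mindmap files were found
assert files, "no files matched {}".format(path_to_files)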

Read Mindmaps#

Read in each map; collect the maps into a corpus.

import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, SubElement, Comment, tostring, ElementTree
def walk_and_collect_dict(node, parent_text, resultdict):
    """walk mindmap, collect n-grams into resultdict"""
    
    myText = node.get('TEXT')
    
    # textAnalysiert = SpaCy.nlp(myText)
    
    # basic bag-of-words (BOW) items: the terms themselves
    resultdict[ "A_" + myText ] = 1
    
    # add n-gram to BOW, e.g. parent<-child
    resultdict[ "B_" + parent_text + "|" + myText ] = 1
    
    # add term plus time stamp of node creation to BOW
    #resultdict[ "C_" + myText + "_" + node.get('CREATED') ] = 1

    # add CREATED to BOW
    #resultdict[ "D_" + "CREATED_" + node.get('CREATED') ] = 1
    
    # add MODIFIED to BOW
    # resultdict[ "E_" + "MODIFIED_" + node.get('MODIFIED') ] = 1
    
    
    for child in node.findall('node'):
        walk_and_collect_dict(child, myText, resultdict)
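To see what the collector produces, a minimal sketch on a hand-written mindmap (the XML snippet is an assumption about the FreeMind/Freeplane format: nested node elements carrying TEXT attributes):

demo_xml = """<map version="1.0.1">
  <node TEXT="root">
    <node TEXT="topic">
      <node TEXT="subtopic"/>
    </node>
  </node>
</map>"""
demo_root = ET.fromstring(demo_xml)
demo_tokens = {}
for n in demo_root.findall('node/node'):
    walk_and_collect_dict(n, "TOP", demo_tokens)
demo_tokens
# expected: {'A_topic': 1, 'B_TOP|topic': 1, 'A_subtopic': 1, 'B_topic|subtopic': 1}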
def read_mm_files(files):
    corpus = {}
    
    # walk through all files
    for file in files:
        # verbose(3,file)
    
        # load file as an XML element tree
        with open(file) as file_ref:
            verbose(2, "reading {}".format(file_ref))
            # https://docs.python.org/2/library/xml.etree.elementtree.html#parsing-xml
            
            # parse mindmap file
            tree = ET.parse(file_ref)
            
            # point root to xml root-element "/map"
            root = tree.getroot()
            
            tokens = {}
            for n in root.findall('node/node'):
                walk_and_collect_dict(n, "TOP", tokens)
            
        corpus[file] = tokens
    return corpus
corpus_dict = read_mm_files(files)
corpus_dict
# https://www.geeksforgeeks.org/how-to-create-dataframe-from-dictionary-in-python-pandas/
# Method 6: Create DataFrame from nested Dictionary.

# do not change
corpus_df = pd.DataFrame(corpus_dict).T.fillna(0)

# show
corpus_df.T
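For orientation, a sketch of what the nested-dict-to-DataFrame step does, on made-up tokens:

demo_corpus = {
    "mm/a.mm": {"A_topic": 1, "B_TOP|topic": 1},
    "mm/b.mm": {"A_topic": 1, "A_other": 1},
}
pd.DataFrame(demo_corpus).T.fillna(0)
#           A_topic  B_TOP|topic  A_other
# mm/a.mm       1.0          1.0      0.0
# mm/b.mm       1.0          0.0      1.0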
# look at only one particular class of columns,
# here: all columns that start with 'C'
[ c for c in corpus_df.columns if c[0] == 'C']
# if you want to do this:
# split the entire corpus into a dictionary of corpora
corpus_df_dict = {}
typliste = ['A', 'B', 'C', 'D']
for t in typliste:
    Auswahl = [ c for c in corpus_df.columns if c[0] == t]
    print(t, Auswahl)
    corpus_df_dict[t] = corpus_df[Auswahl]
    
for t in typliste:
    display(corpus_df_dict[t].T)
# if you want to do this: look at only selected types?
# corpus_df = pd.concat([corpus_df_dict['C'], corpus_df_dict['D']], axis=1)
# corpus_df

TfIdf#

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
verbose(1,transformer)
TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(corpus_df)
tfidf
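With smooth_idf=False, scikit-learn computes idf(t) = ln(n/df(t)) + 1 and then L2-normalises each row. A quick sanity check on a toy count matrix (names made up):

demo_counts = pd.DataFrame([[1, 1, 0],
                            [1, 0, 1]],
                           columns=["t1", "t2", "t3"])
TfidfTransformer(smooth_idf=False).fit_transform(demo_counts).toarray()
# t1 occurs in both documents: idf = ln(2/2) + 1 = 1
# t2 and t3 occur in one document each: idf = ln(2/1) + 1 ≈ 1.69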
verbose(1,pd.DataFrame(tfidf.toarray()))

Cosine Similarity#

We keep things rather low level here, so that we can look under the hood:

from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(tfidf)
verbose(2,similarity)
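Under the hood, cosine similarity is the dot product of L2-normalised row vectors. A sketch that recomputes it by hand and compares with sklearn:

# normalise each row to unit length, then take all pairwise dot products
tfidf_dense = tfidf.toarray()
norms = np.linalg.norm(tfidf_dense, axis=1, keepdims=True)
manual_similarity = (tfidf_dense / norms) @ (tfidf_dense / norms).T
np.allclose(manual_similarity, similarity)  # should agree up to float noise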
similarity_df = pd.DataFrame(similarity)
similarity_df.columns = files 
similarity_df.index = files
similarity_df
import seaborn as sns
# mask out the upper triangle (incl. diagonal): the matrix is symmetric
mask = np.zeros_like(similarity, dtype=bool)
mask[np.triu_indices_from(mask)] = True
ax = sns.heatmap(similarity_df, mask=mask, annot=True, cmap='RdBu')
# clustermap returns a ClusterGrid (not an Axes), which has its own savefig
cg = sns.clustermap(similarity_df, annot=True, cmap='RdBu')
cg.savefig("clustermap.png")

https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
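The clustermap orders rows and columns by such a hierarchical clustering (see the scipy linkage documentation above); it can also be computed explicitly, e.g. (method='average' and metric='euclidean' are seaborn's clustermap defaults):

from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# cluster the rows of the similarity matrix, then draw the dendrogram
Z = linkage(similarity_df, method='average', metric='euclidean')
dendrogram(Z, labels=similarity_df.index.tolist())
plt.show()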