# SentSim

## Imports and Setup

In [23]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import os
import sys

In [24]:
# --- Feedback CSV layout ---
# Row label of the first feedback entry; the CSV is read without a header,
# so row 0 holds the column titles and the data begins at this row.
data_row = 1
# Column label that holds the free-text feedback.
data_column = 2

# --- Test phrases ---
# CSV file of test phrases (resolved against MYPATH).
test_file = 'tests.csv'
# How many total test examples are provided
# NOTE(review): not referenced by any cell shown in this notebook - confirm it is still needed.
num_tests = 5

# --- Model ---
# Path to desired SentenceTransformer model (relative to MYPATH)
model_path = 'models/sentence-transformers_all-mpnet-base-v2'

In [25]:
# Get the path to the install folder set in run.sh
MYPATH = os.getenv('MYPATH')
if not MYPATH:
    # Fail with an actionable message instead of the TypeError that
    # None + 'data' would raise further down.
    raise RuntimeError(
        "Environment variable MYPATH is not set; launch the notebook through "
        "run.sh or export MYPATH to the install folder first."
    )
# Other cells build paths as MYPATH + <relative path>, so guarantee a trailing
# separator: os.path.join(p, '') appends one only when it is missing.
MYPATH = os.path.join(MYPATH, '')
# Folder that holds the feedback CSV to analyse.
directory = os.path.join(MYPATH, 'data')

In [26]:
# os.listdir() returns entries in arbitrary order and also lists hidden
# entries and sub-directories, so keep only visible regular files and sort
# them to make the choice of input file reproducible.
file_list = sorted(
    name for name in os.listdir(directory)
    if not name.startswith('.') and os.path.isfile(os.path.join(directory, name))
)
if not file_list:
    # Clear error instead of an IndexError on file_list[0].
    raise FileNotFoundError(f'No data files found in {directory!r}')
# The first file (alphabetically) is the feedback CSV to analyse.
file_path = os.path.join(directory, file_list[0])
print(file_path)

/projectnb/rcs-intern/SentSim/data/mydata.csv


## Data Input

In [27]:
# Read the feedback CSV with no header row, so the title row stays in the
# frame as row 0 and the columns are labelled by integer position.
# keep_default_na=False leaves empty cells as text instead of NaN.
# NOTE(review): encoding='unicode_escape' also interprets backslash sequences
# inside the text - confirm this is the intended decoding for the input file.
data = pd.read_csv(
    file_path,
    header=None,
    keep_default_na=False,
    engine='python',
    encoding='unicode_escape',
)
data

Unnamed: 0,0,1,2,3
0,ID,Score,Feedback,Notes
1,1,2,I would change the dining hall to have more ty...,Apple
2,17,2,I dont like having to walk to class.,Banana
3,39,9,More buses,Cucumber
4,235423,6,There are too many mosquitoes on campus. They ...,Duck


In [28]:
# Read the test phrases (one per row, no header row) with the same parser
# options as the feedback file. The path is built with os.path.join so it is
# correct whether or not MYPATH ends with a path separator.
tests = pd.read_csv(
    os.path.join(MYPATH, test_file),
    keep_default_na=False,
    header=None,
    encoding='unicode_escape',
    engine='python',
)
tests

Unnamed: 0,0
0,size of classes
1,mass transit
2,public safety
3,quality of food
4,robot uprising


In [29]:
# Test phrases: the single column (label 0) of the tests frame.
# A single .loc[rows, column] lookup replaces the chained .loc[...][...] form.
tests_text = tests.loc[:, 0]
# Feedback text: rows from data_row onward (skipping the title row) in the
# feedback column.
# SentenceTransformer wants indexing to start at 0, so we reset
#     the index and drop the old indices
feedback_text = data.loc[data_row:, data_column].reset_index(drop=True)
feedback_text

0    I would change the dining hall to have more ty...
1                 I dont like having to walk to class.
2                                           More buses
3    There are too many mosquitoes on campus. They ...
Name: 2, dtype: object

## Using the model

In [30]:
# path to the folder that contains the config.json file
# This uses the following HF model:
#  https://huggingface.co/sentence-transformers/all-mpnet-base-v2
# os.path.join keeps the path valid whether or not MYPATH ends with a separator.
model = SentenceTransformer(os.path.join(MYPATH, model_path))

In [32]:
# Embed every test phrase and every feedback entry with the loaded model.
# NOTE: this rebinds `tests` from the DataFrame read from test_file to the
# embedding array, so the cell that builds tests_text cannot be re-run
# without first re-running the cell that reads test_file.
tests = model.encode(tests_text)
feedback = model.encode(feedback_text)

In [33]:
# Sanity check: each array should have one row per input text.
print(tests.shape)
print(feedback.shape)
# Display the raw embeddings of the test phrases.
tests

(5, 768)
(4, 768)


array([[ 0.0053131 ,  0.01246636, -0.00990537, ...,  0.02759881,
        -0.016683  ,  0.01243237],
       [-0.02030246,  0.01770944, -0.00373941, ..., -0.04288072,
        -0.03075187, -0.01293493],
       [-0.00247475,  0.00174779,  0.01997342, ..., -0.02406573,
         0.01295812, -0.00414423],
       [ 0.00664292,  0.08051313, -0.02354828, ...,  0.03988044,
         0.04549434,  0.00696456],
       [ 0.00536648,  0.0683189 ,  0.022613  , ...,  0.04422509,
        -0.03214768, -0.01640472]], dtype=float32)

In [34]:
# Pairwise cosine similarity between the two embedding sets:
# one row per feedback entry, one column per test phrase.
similarity = util.cos_sim(feedback, tests)
similarity

tensor([[ 0.1447,  0.0726,  0.0951,  0.1393,  0.0647],
        [ 0.1878,  0.3848,  0.1217, -0.0127,  0.0539],
        [ 0.2149,  0.5412,  0.3327,  0.1264,  0.1854],
        [ 0.1083,  0.0889,  0.1487,  0.0124,  0.0532]])

In [35]:
# Print the test phrases (columns of the similarity matrix) and the feedback
# entries (rows) so the scores can be matched back to their texts.
print(tests_text)
print(feedback_text)

0    size of classes
1       mass transit
2      public safety
3    quality of food
4     robot uprising
Name: 0, dtype: object
0    I would change the dining hall to have more ty...
1                 I dont like having to walk to class.
2                                           More buses
3    There are too many mosquitoes on campus. They ...
Name: 2, dtype: object


## Examples of behavior
### World Knowledge

In [36]:
def my_compare(a,b):
    """Print cosine similarity scores between ``a`` and each entry of ``b``.

    Both arguments are encoded with the notebook-level ``model`` and compared
    with ``util.cos_sim``. Only the first row of the resulting similarity
    matrix is printed, one score per line with four decimal places, so ``a``
    is expected to be a single sentence; ``b`` may be one sentence or a list
    of sentences. Nothing is returned.
    """
    embedding_a = model.encode(a)
    embedding_b = model.encode(b)
    similarity_matrix = util.cos_sim(embedding_a, embedding_b)
    # Row 0 holds the scores for the first text in ``a``.
    first_row = similarity_matrix.tolist()[0]
    for score in first_row:
        print(format(score, '.4f'))

def my_compare2(a,b):
    """Return the full cosine similarity matrix between ``a`` and ``b``.

    Both arguments are encoded with the notebook-level ``model``; the result
    of ``util.cos_sim`` is returned unchanged, with one row per text in ``a``
    and one column per text in ``b``.
    """
    embedding_a = model.encode(a)
    embedding_b = model.encode(b)
    return util.cos_sim(embedding_a, embedding_b)

In [37]:
# Two sentences with almost no shared wording; relating them requires knowing
# that George Washington was the first U.S. president.
a = "The first president of the United States had dogs."
b = "George Washington owned four French hounds."
my_compare(a,b)

0.7027


In [38]:
# Control sentence on an unrelated topic, scored against a and b.
c = "Michael Phelps swam really well."
my_compare(c,[a,b])

0.1135
0.0883


### Score scaling

In [39]:
# A chemical name, far from the topics of a and c, to show how low scores get.
# NOTE(review): "trichloroaetic" looks like a misspelling of "trichloroacetic";
# left unchanged because editing the string would change the recorded scores.
d = "trichloroaetic acid isopropyl ester"
my_compare(d,[a,c])

-0.0642
-0.0111


In [40]:
# A sentence about a different historical figure, scored against a, b and c.
e = "John Adams wrote 1100 letters to his wife."
my_compare(e,[a,b,c])

0.2757
0.3410
0.0324


In [41]:
# Same subject as b (George Washington) but a different fact about him.
f = "George Washington owned three houses."
my_compare(f,[a,b,c])

0.4964
0.6256
0.0925


### Score dilution

In [42]:
# Sentence b followed by an unrelated sentence in the same string.
g = "George Washington owned four French hounds. A big old well-worn sofa is very comfy to sit in."
my_compare(g,[a,b,c])

0.5964
0.7646
0.0587


### Score smearing

In [43]:
# Sentence b followed by sentence c in a single string, scored against each
# of the individual sentences.
h = "George Washington owned four French hounds. Michael Phelps swam really well."
my_compare(h,[a,b,c,d,e])

0.6073
0.7359
0.5617
-0.0794
0.2085


### Ordering matters

In [44]:
# The same two sentences as the previous cell in reverse order (c, then b).
# NOTE: this rebinds h; the following cell uses this second value.
h = "Michael Phelps swam really well. George Washington owned four French hounds."
my_compare(h,[a,b,c,d,e])

0.5532
0.6585
0.6619
-0.0727
0.2046


### Further smearing

In [45]:
# i is the text of h with the text of d appended as a third topic.
# my_compare2 returns the whole matrix, so the rows for h and i can be
# compared side by side.
i = "Michael Phelps swam really well. George Washington owned four French hounds. trichloroaetic acid isopropyl ester"
my_compare2([h,i],[a,b,c,d,e])

tensor([[ 0.5532,  0.6585,  0.6619, -0.0727,  0.2046],
        [ 0.5001,  0.5478,  0.5280,  0.3385,  0.1875]])

In [None]:
# Pair of sentences built from opposite words, to see how opposites score.
# Fresh names (j, k) are used instead of a and b so that the earlier example
# cells, which compare against a and b, give the same results when re-run.
j = "giant happy sweet lady dancing on summer day"
k = "tiny miserable sour man comatose under winter night"
my_compare(j,k)

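## Analysis
### Challenges
* Multiple topics in the same feedback entry (see the score dilution and smearing examples above)
### Approaches
* Splitting: break feedback into segments at punctuation (periods, exclamation points, commas)
* Direct similarity calculation: thresholding, signal-to-noise ratio (SNR)
* Clustering: explicit and implicit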