get_related_CHEATING = py`
import numpy as np
pd = ${pd}
import pylev #${otherKglabDependencies} ${packagesPy}
import pickle
term_ratio = ${term_ratio}
def get_related (model, query, target, n=20, granularity=100):
"""return a DataFrame of the closely related items"""
try:
bins = np.linspace(0, 1, num=granularity, endpoint=True)
print("CHEATING USING gensim PICKLE FROM PYTHON")
v = pickle.loads(${await FileAttachment(
"v.pkl"
).arrayBuffer()}.tobytes())
#v = sorted(
# model.wv.most_similar(positive=[query], topn=n),
# key=lambda x: x[1],
# reverse=True
#)
df = pd.DataFrame(v, columns=["ingredient", "similarity"])
s = df["similarity"]
quantiles = s.quantile(bins, interpolation="nearest")
df["sim_pct"] = np.digitize(s, quantiles) - 1
df["levenshtein"] = [ pylev.levenshtein(d, query) / len(query) for d in df["ingredient"] ]
s = df["levenshtein"]
quantiles = s.quantile(bins, interpolation="nearest")
df["lev_pct"] = granularity - np.digitize(s, quantiles)
df["term_ratio"] = [ term_ratio(target, d) for d in df["ingredient"] ]
return df
except KeyError:
return pd.DataFrame(columns=["ingredient", "similarity", "percentile"])
get_related`