ex8_0 Vector embedding with gensim (*ref only) / Thad Kerosky

Thad Kerosky

Workspace

Published

kglab.js (alpha) examples

Edited

Dec 29, 2021

Fork of ex6_1 Discover community structure using igraph and leidenalg (*ref only)

1 star

kglab.js (alpha) examples

kglab in ObservableHQ ex0_0 Data Sources ex1_0 Building a graph in RDF using rdflib ex1_1 Leveraging the kglab abstraction layer ex2_0 Build a medium size KG from a CSV dataset ex3_0 Interactive graph visualization with pyvis ex4_0 Running SPARQL queries ex5_0 SHACL validation with pySHACL ex6_0 Graph algorithms with networkx ex6_1 Discover community structure using igraph and leidenalg (*ref only) ex7_0 Measurement and inference ex7_1 Statistical Relational Learning with pslpython (*ref only)

ex8_0 Vector embedding with gensim (*ref only)

live-edit visual of SPARQL query structure

SYNONYMS = py`

# The synonym table is written inline as Python source. Per the original
# author's note, handing it over from the JS side did not JSON.stringify
# well enough for Python to consume.
#
# Each entry below is (canonical name, spellings that map to it); the
# resulting dict maps every spelling to its canonical ingredient name.
SYNONYMS = {
    variant: canonical
    for canonical, variants in (
        ("black pepper", ("pepper", "black pepper")),
        ("egg", ("egg", "eggs")),
        ("vanilla", ("vanilla", "vanilla extract")),
        ("flour", ("flour", "all-purpose flour")),
        ("onion", ("onions", "onion")),
        ("carrot", ("carrots", "carrot")),
        ("potato", ("potatoes", "potato")),
        ("tomato", ("tomatoes", "fresh tomatoes", "fresh tomato")),
        ("garlic", ("garlic", "garlic clove", "garlic cloves")),
    )
    for variant in variants
}

SYNONYMS`

ingredientset_from_recipes = py`

SYNONYMS =${SYNONYMS}

import csv

import os,io

MAX_ROW = 250000 # 231638

max_context = 0

min_context = 1000

recipes = []

vocab = set()

with io.open('all_ind.csv','wb') as f:

f.write(${await kgzip

.file("kglab-main/dat/all_ind.csv")

.arrayBuffer()}.tobytes())

import ast

# Read the CSV that was just written to disk and collect, per recipe, the
# set of ingredient names after synonym substitution.
# newline="" is the mode the csv module asks for when reading files.
with open("all_ind.csv", "r", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # remove file header

    for i, row in enumerate(reader):
        # Named recipe_id rather than id so the builtin id() is not
        # shadowed in the shared Python namespace.
        recipe_id = row[0]

        # Column 3 is parsed into an iterable of ingredient names
        # (presumably a list literal stored as text -- verify against the
        # data file).
        # SECURITY: this used to be eval(row[3]), which executes whatever
        # the CSV contains; ast.literal_eval only accepts plain literals.
        # substitute synonyms
        ind_set = {SYNONYMS.get(ind, ind) for ind in ast.literal_eval(row[3])}

        # Recipes with fewer than two distinct ingredients are skipped.
        if len(ind_set) > 1:
            recipes.append([recipe_id, ind_set])
            vocab.update(ind_set)

            max_context = max(max_context, len(ind_set))
            min_context = min(min_context, len(ind_set))

        # Row cap; the check runs after the row has been processed.
        if i > MAX_ROW:
            break

# Last expression of the cell: the summary handed back to the notebook.
{
    "max context": str(max_context) + " unique ingredients per recipe",
    "min context": str(min_context) + " unique ingredients per recipe",
    "vocab size": str(len(vocab)),
    "recipes": recipes,
}

py`

import pickle

recipes=${ingredientset_from_recipes.get("recipes")}

# Write through a context manager so the file handle is flushed and closed
# instead of being left open for the rest of the session.
with open("tmp.pkl", "wb") as pkl_file:
    pickle.dump(recipes, pkl_file)

recipes[:3]

py`

vectors = [

[

ind

for ind in ind_set

]

for id, ind_set in ${ingredientset_from_recipes.get("recipes")}

]

vectors[:3]

model = py`

# Value passed to Word2Vec as min_count.

MIN_COUNT = 2

# File the trained model is saved to.

model_path = "tmp.w2v"

# Train Word2Vec on the per-recipe ingredient lists, using max_context as the window.
# NOTE(review): vectors and max_context are plain Python names rather than
# interpolated cell references, so they presumably must already exist in the
# shared Python namespace -- confirm the cells defining them run first.

model = ${gensim}.models.Word2Vec(vectors, min_count=MIN_COUNT, window=max_context)

model.save(model_path)`

term_ratio = py`

import numpy as np

import pylev #${otherKglabDependencies}

def term_ratio(target, description):
    """Return the fraction of target terms that occur in description.

    Args:
        target: collection of terms to look for (callers pass a set).
        description: text that is split on single spaces into words.

    Returns:
        A float: the number of target terms found among the words of
        description, divided by the size of target. Returns 0.0 when
        target is empty.
    """
    if not target:
        # Nothing to match; the plain ratio would divide by zero here.
        return 0.0

    words = set(description.split(" "))
    matched = len(words.intersection(target))
    return matched / float(len(target))

term_ratio`

get_related_CHEATING = py`

import numpy as np

pd = ${pd}

import pylev #${otherKglabDependencies} ${packagesPy}

import pickle

term_ratio = ${term_ratio}

def get_related (model, query, target, n=20, granularity=100):

"""return a DataFrame of the closely related items"""

try:

bins = np.linspace(0, 1, num=granularity, endpoint=True)

print("CHEATING USING gensim PICKLE FROM PYTHON")

v = pickle.loads(${await FileAttachment(

"v.pkl"

).arrayBuffer()}.tobytes())

#v = sorted(

# model.wv.most_similar(positive=[query], topn=n),

# key=lambda x: x[1],

# reverse=True

df = pd.DataFrame(v, columns=["ingredient", "similarity"])

s = df["similarity"]

quantiles = s.quantile(bins, interpolation="nearest")

df["sim_pct"] = np.digitize(s, quantiles) - 1

df["levenshtein"] = [ pylev.levenshtein(d, query) / len(query) for d in df["ingredient"] ]

s = df["levenshtein"]

quantiles = s.quantile(bins, interpolation="nearest")

df["lev_pct"] = granularity - np.digitize(s, quantiles)

df["term_ratio"] = [ term_ratio(target, d) for d in df["ingredient"] ]

return df

except KeyError:

return pd.DataFrame(columns=["ingredient", "similarity", "percentile"])

get_related`

df = py`

${pd}.set_option("max_rows", None)

target = set([ "basil" ])

get_related = ${get_related_CHEATING}

#model =

#CHEATING USING PICKLE TO MAKE SURE pylev works in pyodide, it does.

df = get_related("NOT THE MODEL ACTUALLY", "dried basil", target, n=50)

df`

table(df.to_dict("records").toJs().map(Object.fromEntries))

py`

import matplotlib

# plt and df are pulled in from the notebook cells of the same names.

plt = ${plt}

df = ${df}

matplotlib.style.use("ggplot")

# Plot the similarity column.

df["similarity"].plot(alpha=0.75, rot=0)

# NOTE(review): presumably this is the patched show from the plt cell, which returns an img element -- confirm.

plt.show()

df_ranked = py`

from kglab import root_mean_square #${kglab}

def rank_related(df):
    """Return a copy of df with a related score column, best rows first.

    Args:
        df: DataFrame whose columns at positions 2 and 4 hold the two
            values to combine (presumably sim_pct and lev_pct as built by
            get_related -- verify against the caller).

    Returns:
        A deep copy of df with an added related column, the
        root_mean_square of the two values, sorted by it in descending
        order. The input frame is not modified.
    """
    df2 = df.copy(deep=True)

    # .iloc makes the positional lookup explicit; plain row[2] on a
    # string-labelled row relies on a deprecated pandas fallback.
    df2["related"] = df2.apply(
        lambda row: root_mean_square([row.iloc[2], row.iloc[4]]),
        axis=1,
    )

    return df2.sort_values(by=["related"], ascending=False)

df = rank_related(${df})

table(df_ranked.to_dict("records").toJs().map(Object.fromEntries))

df_ranked_filtered = py`

df = ${df_ranked}

# Keep only rows whose related score is at least 50 and whose term_ratio is positive.

df.loc[ (df["related"] >= 50) & (df["term_ratio"] > 0) ]

table(df_ranked_filtered.to_dict("records").toJs().map(Object.fromEntries))

/* Patch plt.show() to work in Observable from https://observablehq.com/@gnestor/pyodide-demo */

plt = py`

import matplotlib

from matplotlib import pyplot as plt #${packagesPy}

import types

import io

import base64

from js import document

def show(self):
    """Save the figure as a PNG and return it as an img element.

    self is the object this function gets bound to; it only needs a
    savefig method. The PNG bytes are embedded in the element's src
    as a base64 data URL.
    """
    png_buffer = io.BytesIO()
    self.savefig(png_buffer, format='png')

    encoded_png = base64.b64encode(png_buffer.getvalue()).decode('UTF-8')

    image = document.createElement('img')
    image.src = 'data:image/png;base64,' + encoded_png
    return image

plt._show = types.MethodType(plt.show, plt)

plt.show = types.MethodType(show, plt)

plt`

gensim = py`import gensim

gensim`

pd = py`import pandas as pd #${packagesPy}

pd`

kglab = py`

import kglab

# ${otherKglabDependencies} ${kglabDependency} ${micropipDependenciesPy}

kglab

py`dir(${kglab})`

py`repr(${kglab})`

import { py, pyodide } from "@thadk/pyodide-18"

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.

Learn more