using SimilaritySearch, SimSearchManifoldLearning, TextSearch, CodecZlib, JSON, DataFrames, PlotlyLight, StatsBase
using Downloads: download
Visualizing Twitter Messages with Emojis
This example creates a vector space model for classifying emojis in Twitter messages, then processes the messages into vectors and projects them using a UMAP model. The projection uses the SimilaritySearch `allknn` operation.
downloading the dataset, parsing and vectorizing functions
# Make sure the local cache directory exists (no-op when it is already there).
mkpath("tmp")
# Local path where the gzip-compressed, line-delimited JSON dataset is stored.
dbfile = "tmp/emo50k.json.gz"
# Remote location the dataset is fetched from when no local copy is found.
baseurl = "https://github.com/sadit/TextClassificationTutorial/raw/refs/heads/main/data/emo50k.json.gz"
!isfile(dbfile) && download(baseurl, dbfile)
false
Now, we load the dataset
# Every line of the gzip-compressed file holds one JSON record. All lines are
# parsed eagerly while the stream is open and gathered into a single table.
D = open(GzipDecompressorStream, dbfile) do io
    DataFrame(JSON.parse.(eachline(io)))
end
collect(countmap(D.klass))
64-element Vector{Pair{String, Int64}}:
"✨" => 801
"🤤" => 771
"😁" => 794
"😡" => 776
"😏" => 757
"🤣" => 780
"👌" => 779
"😭" => 785
"🤔" => 732
"😈" => 774
⋮
"🙄" => 748
"💙" => 770
"🙊" => 786
"😘" => 815
"🙈" => 772
"💕" => 747
"😑" => 812
"😔" => 782
"😳" => 839
# Restrict the dataset to the messages labelled with one of the four emojis
# used in this example.
D = filter(:klass => in(("😭", "🤣", "😍", "😤")), D)
# Number of messages per remaining class.
collect(countmap(D.klass))
#H = sort!(collect(countmap(D.klass)), by=first)
#bar(first.(H), last.(H))
4-element Vector{Pair{String, Int64}}:
"🤣" => 780
"😤" => 808
"😭" => 785
"😍" => 816
Functions created to encode text into bag-of-words vectors
# Text normalization and tokenization settings used to build the vocabulary.
# NOTE(review): `nlist=[1]` / `qlist=[3]` presumably select word unigrams and
# character 3-grams; confirm against the TextSearch documentation.
textconfig = TextConfig(;
    group_usr=true,
    group_url=true,
    del_diac=true,
    lc=true,
    group_num=true,
    nlist=[1],
    qlist=[3],
)

# corpus here can be a sample to avoid double parsing
voc = Vocabulary(textconfig, D.text)

# Alternative weighting scheme, kept for reference:
# model = VectorModel(IdfWeighting(), TfWeighting(), voc)
# The entropy-weighted model is built from the texts together with their labels.
model = VectorModel(
    EntropyWeighting(), BinaryLocalWeighting(), voc, D.text, D.klass; smooth=1.0
)
#model = VectorModel(IdfWeighting(), TfWeighting(), voc)

# Token selection by weight with a 0.075 threshold (assumes `filter_tokens`
# keeps the tokens for which the predicate is true; confirm).
model = filter_tokens(t -> t.weight >= 0.075, model)
vectors = VectorDatabase(vectorize_corpus(model, D.text))
UMAP projections
UMAP projection can take a while, even on multithreaded systems. Note that we are creating 2D and 3D projections.
# Fit a 2D UMAP model on a random sample of the vectors, derive a 3D model from
# it, and project the whole collection with both. The pair (e2, e3) of projected
# coordinate matrices is returned from the `let` block.
# (1) The hyperparameters are described in the note that follows this cell.
e2, e3 = let min_dist = 0.5f0,   # controls the distance between projected points
    k = 16,                      # neighbors used in the underlying knn graph
    n_epochs = 75,               # epochs used to optimize the projection
    neg_sample_rate = 3,         # negative examples used during optimization
    tol = 1e-3,                  # tolerance to converge
    layout = SpectralLayout(),
    indexsize = 768,             # number of vectors drawn for the fitting index
    dist = Dist.NormCosine()

    # The model is fitted on `indexsize` randomly drawn vectors, not on all of them.
    index = ExhaustiveSearch(; db=rand(vectors, indexsize), dist)
    @time U2 = fit(UMAP, index; k, neg_sample_rate, layout, n_epochs, tol, min_dist)
    # The 3D model is fitted starting from the already fitted 2D model.
    @time U3 = fit(U2, 3; neg_sample_rate, n_epochs, tol)
    # Project every vector and clamp the coordinates to [-10, 10].
    @time e2 = clamp.(predict(U2, vectors), -10f0, 10f0)
    @time e3 = clamp.(predict(U3, vectors), -10f0, 10f0)
    e2, e3
end
-
The UMAP algorithm has a lot of hyperparameters:
`min_dist` controls the distance between projected points, `k` is the number of neighbors to be used in the underlying \(k\)nn graph, `n_epochs` is the number of epochs used to optimize the projection, `neg_sample_rate` is the number of negative examples used in the optimization process, `tol` is the tolerance to converge, and `layout` is set to `SpectralLayout()` here.
Visualizations
Plots
"""
    normcolors!(V)

Rescale the values of `V` in place to the unit interval and return `V`.

The minimum of `V` is mapped to 0 and the maximum to 1. When all values are
equal the rescaling step is skipped, and the values are only clamped to
`[0, 1]`.
"""
function normcolors!(V)
    lo, hi = extrema(V)
    span = hi - lo
    # A zero span would divide by zero, so the rescale only runs for span > 0.
    if span > 0
        @. V = (V - lo) / span
    end
    # Final clamp keeps every value inside [0, 1], also when no rescale ran.
    @. V = clamp(V, 0, 1)
    return V
end
# Rescale the first three rows of the 3D embedding in place, one row at a time,
# so that each can be used as a color channel.
for channel in 1:3
    normcolors!(view(e3, channel, :))
end

# One CSS `rgba(...)` string per projected point: the three rescaled coordinates
# become the red, green and blue components, with a fixed alpha of 0.3.
colors = map(eachcol(e3)) do point
    r, g, b = (round(Int, point[i] * 255) for i in 1:3)
    "rgba($(r), $(g), $(b), 0.3)"
end
# Plot traces: a single WebGL scatter trace ("scattergl") drawn as markers.
data = [Config(;
# Marker positions come from the first and second rows of the 2D projection.
x = view(e2, 1, :),
y = view(e2, 2, :),
mode = "markers",
marker = (
# One color string per point, taken from `colors`.
color = colors,
size = 4,
line = (width = 0,)
),
# Hovering a marker shows the label (`klass`) of the corresponding message.
hovertext = D.klass,
type = "scattergl"
)]
# Figure layout: a 600x600 canvas on a white background with both axes hidden
# (no grid, no zero line); hover picks the closest point.
layout = Config(
width = 600,
height = 600,
xaxis = (visible = false, showgrid = false, zeroline = false),
yaxis = (visible = false, showgrid = false, zeroline = false),
hovermode = "closest",
plot_bgcolor = "white"
)
Plot(data, layout)
Environment and dependencies
Julia Version 1.10.11 Commit a2b11907d7b (2026-03-09 14:59 UTC) Build Info: Official https://julialang.org/ release Platform Info: OS: macOS (x86_64-apple-darwin24.0.0) CPU: 8 × Intel(R) Core(TM) i5-8257U CPU @ 1.40GHz WORD_SIZE: 64 LIBM: libopenlibm LLVM: libLLVM-15.0.7 (ORCJIT, skylake) Threads: 8 default, 0 interactive, 4 GC (on 8 virtual cores) Environment: JULIA_NUM_THREADS = auto JULIA_PROJECT = @. JULIA_LOAD_PATH = @:@stdlib Status `~/Research/SimilaritySearchDemos/Project.toml` [aaaa29a8] Clustering v0.15.8 [944b1d66] CodecZlib v0.7.8 [5ae59095] Colors v0.13.1 [a93c6f00] DataFrames v1.8.1 [c5bfea45] Embeddings v0.4.6 [f67ccb44] HDF5 v0.17.2 [916415d5] Images v0.26.2 [b20bd276] InvertedFiles v0.9.2 ⌅ [682c06a0] JSON v0.21.4 [23fbe1c1] Latexify v0.16.10 [eb30cadb] MLDatasets v0.7.21 [06eb3307] ManifoldLearning v0.9.0 ⌃ [ca7969ec] PlotlyLight v0.11.0 [27ebfcd6] Primes v0.5.7 [ca7ab67e] SimSearchManifoldLearning v0.4.0 [053f045d] SimilaritySearch v0.14.3 ⌅ [2913bbd2] StatsBase v0.33.21 [7f6f6c8a] TextSearch v0.20.0 Info Packages marked with ⌃ and ⌅ have new versions available. Those with ⌃ may be upgradable, but those with ⌅ are restricted by compatibility constraints from upgrading. To see why use `status --outdated`