Visualizing Twitter Messages with Emojis

This example creates a vector space model to classify emojis in Twitter messages, then processes the messages into vectors and projects them using a UMAP model. The projection uses the SimilaritySearch allknn operation.

using SimilaritySearch, SimSearchManifoldLearning, TextSearch, CodecZlib, JSON, DataFrames, Plots, StatsBase
using Downloads: download

Downloading the dataset, and the parsing and vectorizing functions

# Local cache path and remote location of the gzip-compressed dataset.
dbfile = "tmp/emo50k.json.gz"
baseurl = "https://github.com/sadit/TextClassificationTutorial/raw/refs/heads/main/data/emo50k.json.gz"

# Make sure the cache directory exists, then download only when the file is not cached yet
# (the expression evaluates to `false` when there is nothing to download).
mkpath("tmp")
isfile(dbfile) ? false : download(baseurl, dbfile)

false

Now, we load the dataset

# Stream the gzip-compressed file line by line; every line is parsed as one JSON document
# and the parsed records become the rows of the table.
D = open(GzipDecompressorStream, dbfile) do io
    DataFrame(map(JSON.parse, eachline(io)))
end

# Number of messages per label found in the `klass` column.
countmap(D.klass) |> collect

64-element Vector{Pair{String, Int64}}:
 "✨" => 801
 "🤤" => 771
 "😁" => 794
 "😡" => 776
 "😏" => 757
 "🤣" => 780
 "👌" => 779
 "😭" => 785
 "🤔" => 732
 "😈" => 774
      ⋮
 "🙄" => 748
 "💙" => 770
 "🙊" => 786
 "😘" => 815
 "🙈" => 772
 "💕" => 747
 "😑" => 812
 "😔" => 782
 "😳" => 839


# Keep only the rows whose label is one of the four emojis used in this demo.
D = filter(:klass => in(("😭", "🤣", "😍", "😤")), D)

# Label counts after filtering.
countmap(D.klass) |> collect
# Optional bar chart of the label distribution:
#H = sort!(collect(countmap(D.klass)), by=first)
#bar(first.(H), last.(H))

4-element Vector{Pair{String, Int64}}:
 "🤣" => 780
 "😤" => 808
 "😭" => 785
 "😍" => 816

Functions created to encode text into bag-of-words vectors

# Text normalization and tokenization settings.
# NOTE(review): the per-flag remarks below follow the usual meaning of TextSearch's
# `TextConfig` options; confirm against the installed TextSearch version.
textconfig = TextConfig(;
    # normalization
    lc=true,          # presumably lowercases the text
    del_diac=true,    # presumably removes diacritic marks
    # grouping of token families into a single placeholder token (presumably)
    group_usr=true,
    group_url=true,
    group_num=true,
    # tokenizers: presumably word n-gram sizes (nlist) and character q-gram sizes (qlist)
    nlist=[1],
    qlist=[3],
)

# Vocabulary built from the whole corpus; a sample could be passed instead to avoid
# parsing every message twice.
voc = Vocabulary(textconfig, D.text)

# Vector model that uses the labels (`D.klass`) to compute entropy-based token weights.
# Unsupervised alternative: VectorModel(IdfWeighting(), TfWeighting(), voc)
model = VectorModel(EntropyWeighting(), BinaryLocalWeighting(), voc, D.text, D.klass; smooth=1.0)

# Keep only the tokens whose weight is at least 0.075.
model = filter_tokens(tok -> tok.weight >= 0.075, model)

# Encode every message with the filtered model.
vectors = vectorize_corpus(model, D.text)

UMAP projections

UMAP projection can take a while, even on multithreaded systems. Note that we are creating both 2D and 3D projections.

# Compute the 2D (`e2`) and 3D (`e3`) UMAP projections of `vectors`.
# The hyperparameters bound by the `let` header are the ones described in annotation 1
# below this listing; the stray annotation marker that preceded `e2` was not part of the code.
e2, e3 = let min_dist=0.5f0,                  # distance between projected points
             k=16,                            # neighbors used in the underlying knn graph
             n_epochs=75,                     # optimization epochs
             neg_sample_rate=3,               # negative examples used while optimizing
             tol=1e-3,                        # convergence tolerance
             layout=SpectralLayout(),
             indexsize=768,                   # number of vectors sampled to fit the model
             dist=NormalizedCosineDistance()

    # The model is fitted on a random sample of `indexsize` vectors, not on the full corpus.
    index = ExhaustiveSearch(; db=rand(vectors, indexsize), dist)
    @time U2 = fit(UMAP, index; k, neg_sample_rate, layout, n_epochs, tol, min_dist)
    # The 3D model is derived from the already fitted 2D model.
    @time U3 = fit(U2, 3; neg_sample_rate, n_epochs, tol)
    # Project the whole corpus with both models; coordinates are clamped to [-10, 10].
    @time e2 = clamp.(predict(U2, vectors), -10f0, 10f0)
    @time e3 = clamp.(predict(U3, vectors), -10f0, 10f0)
    e2, e3
end

1: The UMAP algorithm has many hyperparameters: min_dist controls the distance between projected points, k is the number of neighbors used in the underlying \(k\)nn graph, n_epochs is the number of epochs used to optimize the projection, neg_sample_rate is the number of negative examples used in the optimization process, tol is the convergence tolerance, and layout is the strategy used to initialize the projection (a spectral layout here).

Visualizations

"""
    normcolors(V)

Rescale the values of `V` in place to the `[0, 1]` range (min-max normalization)
and return `V`.

When all the values of `V` are equal the range is zero; in that case every entry is
set to zero instead of producing `NaN` from a `0 / 0` division. An empty `V` is
returned unchanged.
"""
function normcolors(V)
    # `extrema` throws on empty collections; there is nothing to normalize anyway.
    isempty(V) && return V

    lo, hi = extrema(V)
    span = hi - lo
    if iszero(span)
        # Constant input: avoid dividing by zero.
        fill!(V, zero(eltype(V)))
    else
        # The clamp guards against tiny floating point excursions outside [0, 1].
        V .= clamp.((V .- lo) ./ span, 0, 1)
    end

    return V
end

# Normalize each of the three coordinates of the 3D projection to [0, 1], in place.
for dim in 1:3
    normcolors(view(e3, dim, :))
end

# One color per message: the normalized 3D coordinates are used as the RGB channels.
C = @views RGB.(e3[1, :], e3[2, :], e3[3, :])

# Coordinates of the 2D projection (views into `e2`, no copies).
X = view(e2, 1, :)
Y = view(e2, 2, :)
scatter(X, Y; color=C, markersize=4, alpha=0.5)

# Label 100 randomly chosen messages with their emoji.
for _ in 1:100
    idx = rand(1:length(D.klass))
    label = text(D.klass[idx], :blue, :right, 8, "noto")
    annotate!(X[idx], Y[idx], label)
end

# Display the current plot with the annotations applied.
plot!()

Environment and dependencies

Julia Version 1.10.9
Commit 5595d20a287 (2025-03-10 12:51 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 64 × Intel(R) Xeon(R) Silver 4216 CPU @ 2.10GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, cascadelake)
Threads: 64 default, 0 interactive, 32 GC (on 64 virtual cores)
Environment:
  JULIA_PROJECT = .
  JULIA_NUM_THREADS = auto
  JULIA_LOAD_PATH = @:@stdlib
Status `~/sites/SimilaritySearchDemos/Project.toml`
  [aaaa29a8] Clustering v0.15.8
  [944b1d66] CodecZlib v0.7.8
  [a93c6f00] DataFrames v1.7.0
  [c5bfea45] Embeddings v0.4.6
  [f67ccb44] HDF5 v0.17.2
  [b20bd276] InvertedFiles v0.8.0 `~/.julia/dev/InvertedFiles`
  [682c06a0] JSON v0.21.4
  [23fbe1c1] Latexify v0.16.6
  [eb30cadb] MLDatasets v0.7.18
  [06eb3307] ManifoldLearning v0.9.0
⌃ [ca7969ec] PlotlyLight v0.11.0
  [91a5bcdd] Plots v1.40.11
  [27ebfcd6] Primes v0.5.7
  [ca7ab67e] SimSearchManifoldLearning v0.3.0 `~/.julia/dev/SimSearchManifoldLearning`
  [053f045d] SimilaritySearch v0.12.0 `~/.julia/dev/SimilaritySearch`
⌅ [2913bbd2] StatsBase v0.33.21
  [f3b207a7] StatsPlots v0.15.7
  [7f6f6c8a] TextSearch v0.19.0 `~/.julia/dev/TextSearch`
Info Packages marked with ⌃ and ⌅ have new versions available. Those with ⌃ may be upgradable, but those with ⌅ are restricted by compatibility constraints from upgrading. To see why use `status --outdated`