randpup.git
ngram.py

#!/bin/python3
# randpup - ngram text generator kernel module
# Copyright (C) 2025 ArcNyxx
# see LICENCE.MIT file for licensing information
def strings(samples: list[str]) -> None:
    """Print the ``pup_strings`` C table for the given fixed strings.

    Writes to stdout the element count (``pup_strings_len``) followed by
    an array initialiser holding one ``STRING("...")`` entry per sample,
    in input order, and a trailing blank line.

    Each sample is inserted into the C string literal verbatim; no
    escaping is applied.
    """
    rows = [f"\tSTRING(\"{entry}\")," for entry in samples]
    print(
        f"const int pup_strings_len = {len(samples)};",
        "const pup_string_t pup_strings[] = {",
        *rows,
        "};\n",
        sep="\n",
    )
def repeats(samples: list[tuple[str, str, int, int]]) -> None:
    """Print the ``pup_repeats`` C table.

    Each sample is a ``(string, ch, mini, maxi)`` tuple.  Writes to stdout
    the element count (``pup_repeats_len``) followed by an array
    initialiser with one entry per sample of the form
    ``{ "string", sizeof("string") - 1, mini, maxi, 'ch' }``, and a
    trailing blank line.

    ``string`` and ``ch`` are inserted into the C literals verbatim; no
    escaping is applied.
    """
    print(f"const int pup_repeats_len = {len(samples)};")
    print("const pup_string_repeat_t pup_repeats[] = {")
    for string, ch, mini, maxi in samples:
        # sizeof("...") - 1 makes the C compiler compute the length
        # without the terminating NUL.
        print(f"\t{{ \"{string}\", sizeof(\"{string}\") - 1, "
              f"{mini}, {maxi}, '{ch}' }},")
    print("};\n")
def ngram(strings: list[str], n: int, label: str) -> None:
    """Print the C tables describing a character n-gram model.

    Every window of ``n`` consecutive characters in the samples that is
    followed by another character becomes one n-gram.  Four definitions
    are written to stdout, each followed by a blank line:

    * ``pup_<label>_choice``: for each n-gram, one entry per distinct
      following character (most frequent first) holding the index of the
      n-gram reached by dropping the first character and appending that
      follower, plus the cumulative follower count.
    * ``pup_<label>_ngram``: for each n-gram, its last character, the
      offset of its first entry in the choice table, and its total
      follower count.
    * ``pup_<label>_start``: for each distinct ``n``-character sample
      prefix, its first character, its n-gram index and the cumulative
      number of samples; the entries are only emitted when ``n > 1``.
    * ``pup_<label>_gen``: a struct tying the three tables together with
      the n-gram count and the total start count (0 unless ``n > 1``).

    Args:
        strings: Sample texts to learn from.
        n: Window length in characters.
        label: Identifier fragment used in the generated C names.

    Raises:
        KeyError: If a window reached by following a character, or a
            sample's ``n``-character prefix, never occurs followed by
            another character (for instance the unique tail of a sample,
            or a sample of at most ``n`` characters).
        IndexError: If a sample is the empty string.
    """
    # Count sample prefixes and, for every window, its following characters.
    start_dict: dict[str, int] = {}
    token_dict: dict[str, dict[str, int]] = {}
    for string in strings:
        head = string[:n]
        start_dict[head] = start_dict.get(head, 0) + 1
        for i in range(len(string) - n):
            followers = token_dict.setdefault(string[i:i + n], {})
            follower = string[i + n]
            followers[follower] = followers.get(follower, 0) + 1

    # Most frequent follower first; the sort is stable, so followers with
    # equal counts keep first-seen order.
    token_list: list[tuple[str, list[tuple[str, int]]]] = [
        (token, sorted(followers.items(), key=lambda t: t[1], reverse=True))
        for token, followers in token_dict.items()
    ]
    ngram_indeces: dict[str, int] = {
        token: index for index, (token, _) in enumerate(token_list)
    }

    choice_rows: list[str] = []
    ngram_rows: list[str] = []
    offset = 0  # index of the current n-gram's first row in the choice table
    for i, (string, choice_list) in enumerate(token_list):
        choice_rows.append(f"\t/* \"{string}\" {i + 1} */\n")
        running = 0
        for char, freq in choice_list:
            running += freq
            # Successor window: drop the first character, append the follower.
            follow = ngram_indeces[string[1:] + char]
            choice_rows.append(
                f"\t{{ .next = {follow}, .prob = {running} }},\n")
        # After the loop `running` is the n-gram's total follower count.
        ngram_rows.append(
            f"\t{{ .ch = '{string[-1]}', .choice = {offset}, "
            f".prob = {running} }}, /* \"{string}\" {i + 1} */\n")
        offset += len(choice_list)

    # Built unconditionally (even when not printed) so that lookup errors
    # surface regardless of n.
    start_rows: list[str] = []
    accum = 0
    for string, num in start_dict.items():
        accum += num
        start_rows.append(
            f"\t{{ .ch = '{string[0]}', "
            f".ngram = {ngram_indeces[string]}, .prob = {accum} }},\n")

    print(f"const pup_choice_t pup_{label}_choice[] = {{")
    print("".join(choice_rows), end='')
    print("};\n")

    print(f"const pup_ngram_t pup_{label}_ngram[] = {{")
    print("".join(ngram_rows), end='')
    print("};\n")

    print(f"const pup_start_t pup_{label}_start[] = {{", end='')
    if n > 1:
        print("\n" + "".join(start_rows), end='')
    print("};\n")

    print(f"const pup_gen_t pup_{label}_gen = {{")
    print(f"\t.ngram = pup_{label}_ngram,")
    print(f"\t.choice = pup_{label}_choice,")
    print(f"\t.start = pup_{label}_start,")
    print(f"\t.len = {len(ngram_rows)},")
    print(f"\t.prob = {accum if n > 1 else 0},")
    print("};\n")
def ngrams(samples: list[tuple[list[str], int, str]]) -> None:
    """Print every n-gram generator followed by the ``pup_gens`` index.

    Each sample is a ``(strings, n, label)`` triple that is passed to
    ``ngram()`` to emit that generator's tables.  Afterwards the element
    count (``pup_gens_len``) and the ``pup_gens`` array are printed, with
    one ``{ &pup_<label>_gen, 25, 75 }`` entry per sample in input order.
    """
    gen_rows = []
    for corpus, order, label in samples:
        ngram(corpus, order, label)
        gen_rows.append(f"\t{{ &pup_{label}_gen, 25, 75 }},\n")

    print(f"const int pup_gens_len = {len(samples)};")
    print("const pup_gendef_t pup_gens[] = {")
    print("".join(gen_rows), end='')
    print("};")
# Banner comment and include line of the generated C file.
print(
    "/* randpup - ngram text generator kernel module",
    " * Copyright (C) 2025 ArcNyxx",
    " * see LICENCE file for licensing information */\n",
    "#include \"ngram.h\"\n",
    sep="\n",
)

# Fixed strings emitted into the pup_strings table.
strings([
    "*paws at u*",
    "*steals ur food*",
    "*barks at nothing*",
    "*jumps up on the couch*",
    "*chews on the furniture*",
    "*gnaws on ur arm*",
    "*eats ur sock*",
    "*slobbers everywhere*",
    "*licks ur face*",
    "*nips ur ear*",
    "*runs into the street*",
    "*knocks over lamp*",
    "*scratches ur floor*",
    "*nuzzles u*",
    "*bites ur arm*",
])

# pup_repeats entries, unpacked by repeats() as (string, ch, mini, maxi).
repeats([
    (":", "3", 1, 9),
    (";", "3", 1, 9),
    (">:", "3", 1, 9),
    (">;", "3", 1, 9),
])

# One (sample strings, n, label) triple per generated n-gram model.
ngrams([
    (
        [
            "arf arf woof bark bark arf ruff ruff grr ruff bark ark ark arf",
            "bark bark ruff ruff arf arf grr ruff arf ark ark ark arf",
            "ruff bark ruff ruff bark bark grrr grrrr arf arf bark ruff",
        ],
        3,
        "barking1",
    ),
    (
        [
            "bark bark ark arf arf grrrruff raff arfarf grrraff ruff ruff arfarfarf",
            "arf grr ruff arf arfruffruff bark",
        ],
        3,
        "barking2",
    ),
    (
        [
            "arfarfarfbarkarkarkbarkruffaruffaruffruffgrrrarfarfarfruffruffarfarfarfruffbarkbarkgrrrrrrgrrarfarfarfruffbarkarkarkbarkbarkbarkbarkarf",
            "barkbarkbarkarfarfarkarkarkruffbarkruffgrrruffruffarfarkark",
        ],
        2,
        "barking3",
    ),
    (
        [
            "arrrrrrr awoooo awoo awoooo auwoooo awuuuuu auooo arrr aurrr aruuuuuu awrooooo awoooo",
            "ouuuuuu awouuuu awwooOOOOOooo arruff aroooo arrrf awhuuuuuu auuuUUuuu arrRRRRUUUU",
            "woooooOOOoo aaaWWOOOOOO",
        ],
        2,
        "howling",
    ),
])