TamiLM-Telegram-Bot/bot_hf_tokenizers_cuda.py

#!/usr/bin/env python
# pylint: disable=unused-argument
# This program is dedicated to the public domain under the CC0 license.

"""
Simple Bot to reply to Telegram messages.

First, a few handler functions are defined. Then, those functions are passed to
the Application and registered at their respective places.
Then, the bot is started and runs until we press Ctrl-C on the command line.

Usage:
Basic Echobot example, repeats messages.
Press Ctrl-C on the command line or send a signal to the process to stop the
bot.
"""

import math
########################################################################################################
# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM
########################################################################################################

import numpy as np
np.set_printoptions(precision=4, suppress=True, linewidth=200)
import types, torch, copy, time
from typing import List
# Global PyTorch backend switches, set once at import time before any model
# code runs: enable the cuDNN benchmark mode and allow TF32 in cuDNN and in
# CUDA matmuls.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
# Reduced-precision reductions are intentionally left at their defaults.
# torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
# torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
# NOTE(review): private torch API; presumably turns off autocast handling
# inside the TorchScript JIT - confirm against the installed torch version.
torch._C._jit_set_autocast_mode(False)

import torch.nn as nn
from torch.nn import functional as F

# Short aliases for the TorchScript base class and decorators.
# RWKV_RNN derives from MyModule and decorates its forward with MyFunction;
# MyStatic is not referenced in the visible part of this file.
MyModule = torch.jit.ScriptModule
MyFunction = torch.jit.script_method
MyStatic = torch.jit.script

########################################################################################################

from tokenizers import Tokenizer
from tokenizers import decoders
# Module-wide tokenizer: loaded from the serialized vocabulary file and given
# a WordPiece decoder. generate_text uses it both to encode the prompt and to
# decode the generated ids back into text.
tokenizer = Tokenizer.from_file("tokenizer_vocab/tokenizer-tami-wordpiece-hedc4.json")
tokenizer.decoder = decoders.WordPiece()

# Checkpoint location (without the '.pth' suffix, which RWKV_RNN appends) and
# the architecture hyper-parameters the model code reads from this namespace.
args = types.SimpleNamespace(
    MODEL_NAME='tami-models/rwkv-final-maybe-knesset',
    n_layer=12,
    n_embd=512,
    vocab_size=65536,
    head_size=64,
)

#context = "מחפשת בדחיפות  מחשב נייד הכי פשוט עם מטען"


########################################################################################################

class RWKV_RNN(MyModule):
    """RWKV model evaluated one token at a time, with explicit recurrent state.

    The checkpoint is kept as a dictionary of tensors in ``self.z`` and the
    weights are looked up by their string key inside ``forward``.
    """
    def __init__(self, args):
        """Load ``args.MODEL_NAME + '.pth'`` onto CUDA and pre-process the weights.

        Reads ``args.MODEL_NAME``, ``args.n_embd``, ``args.n_layer`` and
        ``args.head_size``. Fails with ``AssertionError`` when the head size
        derived from the checkpoint differs from ``args.head_size``.
        """
        super().__init__()
        self.args = args
        self.n_embd = args.n_embd
        self.n_layer = args.n_layer
        self.eval()
        
        self.z = torch.load(args.MODEL_NAME + '.pth', map_location='cuda')
        z = self.z
        # Apply the block-0 "ln0" layer norm to the embedding table once here,
        # so forward() can use an embedding row directly.
        z['emb.weight'] = F.layer_norm(z['emb.weight'], (args.n_embd,), weight=z['blocks.0.ln0.weight'], bias=z['blocks.0.ln0.bias'])
        
        # Snapshot of the key list: the dict is modified while we walk it.
        keys = list(z.keys())
        for k in keys:
            # Drop size-1 dimensions from every '.time_*' parameter, keep the
            # decay in float32, and give time_faaaa a trailing axis (also float32).
            if             '.time_' in k: z[k] = z[k].squeeze()
            if k.endswith('.time_decay'): z[k] = z[k].float()
            if k.endswith('.time_faaaa'): z[k] = z[k].unsqueeze(-1).float()
        for k in keys:
            if k.endswith('maa_w'):
                # Fuse the five per-branch mixing vectors (w, k, v, r, g) into one
                # (5, -1) tensor stored under '...maa_wkvrg', then remove the originals.
                z[k.replace('maa_w','maa_wkvrg')] = torch.concat([z[k],z[k.replace('maa_w','maa_k')],z[k.replace('maa_w','maa_v')],z[k.replace('maa_w','maa_r')],z[k.replace('maa_w','maa_g')]]).clone().reshape(5, -1)
                del z[k]
                del z[k.replace('maa_w','maa_k')]
                del z[k.replace('maa_w','maa_v')]
                del z[k.replace('maa_w','maa_r')]
                del z[k.replace('maa_w','maa_g')]
                
        # Head count comes from the checkpoint; the head size follows from the
        # ln1 width and must agree with the configured value.
        self.n_head = z['blocks.0.att.time_faaaa'].shape[0]
        self.head_size = z['blocks.0.ln1.weight'].shape[0] // self.n_head
        assert self.head_size == args.head_size

    @MyFunction
    def forward(self, token:int, state:List[torch.Tensor]):
        # Runs one token through all layers.
        #   token: vocabulary index used to pick a row of the embedding table.
        #   state: three tensors per layer; slots i*3+0 and i*3+1 are read and
        #          replaced by time_mixing, slot i*3+2 by channel_mixing.
        # Returns the head projection of the final hidden vector (the caller
        # treats it as logits) together with the updated state list.
        with torch.no_grad(): 
            z = self.z
            x = z['emb.weight'][token]
            for i in range(self.n_layer):
                # Key prefixes for this layer's weights.
                bbb = f'blocks.{i}.'
                att = f'blocks.{i}.att.'
                ffn = f'blocks.{i}.ffn.'

                # Time-mixing sub-block with residual connection.
                xx = F.layer_norm(x, (self.n_embd,), weight=z[bbb+'ln1.weight'], bias=z[bbb+'ln1.bias'])
                xx, state[i*3+0], state[i*3+1] = time_mixing(self.n_head, self.head_size, xx, state[i*3+0], state[i*3+1],
                    z[att+'time_maa_x'], z[att+'time_maa_wkvrg'], z[att+'time_maa_w1'], z[att+'time_maa_w2'],
                    z[att+'time_decay_w1'], z[att+'time_decay_w2'], z[att+'time_faaaa'], z[att+'time_decay'],
                    z[att+'key.weight'], z[att+'value.weight'], z[att+'receptance.weight'], z[att+'gate.weight'], z[att+'output.weight'],
                    z[att+'ln_x.weight'], z[att+'ln_x.bias'])
                x = x + xx

                # Channel-mixing sub-block with residual connection.
                xx = F.layer_norm(x, (self.n_embd,), weight=z[bbb+'ln2.weight'], bias=z[bbb+'ln2.bias'])
                xx, state[i*3+2] = channel_mixing(xx, state[i*3+2],
                    z[ffn+'time_maa_k'], z[ffn+'time_maa_r'], 
                    z[ffn+'key.weight'], z[ffn+'value.weight'], z[ffn+'receptance.weight'])
                x = x + xx
            
            # Final layer norm followed by the output head.
            x = F.layer_norm(x, (self.n_embd,), weight=z['ln_out.weight'], bias=z['ln_out.bias'])
            x = z['head.weight'] @ x
            return x, state

########################################################################################################

def time_mixing__(H:int, N:int, x, x_prev, state, maa_x, maa_wkvrg, tm_w1, tm_w2, td_w1, td_w2, time_faaaa, time_decay, kw, vw, rw, gw, ow, ln_w, ln_b):
    """Time-mixing step of one RWKV block for a single token.

    ``H`` and ``N`` are the head count and per-head size (the caller passes
    ``n_head`` and ``head_size``). ``x`` is the layer-normed input, ``x_prev``
    the input kept from the previous token, and ``state`` the float32
    per-head matrix state. The remaining arguments are this layer's weights.

    Returns ``(output, x, new_state)``: the projected output, the unchanged
    input ``x`` (stored by the caller as the next ``x_prev``), and the
    updated state.
    """
    # Difference to the previous token's input, used for every interpolation below.
    sx = x_prev - x
    xxx = x + sx * maa_x                            # C
    xxx = torch.tanh(xxx @ tm_w1).view(5, 1, -1)    # C @ C*5L => 5L => 5*1*L
    xxx = torch.bmm(xxx, tm_w2).view(5, -1)         # 5*1*L @ 5*L*C => 5*1*C => 5*C
    # Add the static mixing vectors, then blend x with x_prev separately for
    # each of the five branches.
    xxx = xxx + maa_wkvrg
    xxx = xxx * sx.expand(5, -1) + x.expand(5, -1)
    w, k, v, r, g = xxx.unbind(dim=0)

    # Input-dependent decay, computed in float32; exp(-exp(.)) lies in (0, 1).
    w = torch.tanh(w @ td_w1) @ td_w2
    w = w.float() + time_decay
    # assert w.dtype == torch.float
    w = torch.exp(-torch.exp(w))

    # Per-head column / row vectors so that k @ v is an outer product.
    k = (kw @ k).view(H, N, 1)
    v = (vw @ v).view(H, 1, N)
    r = (rw @ r).view(H, 1, N)
    g = torch.nn.functional.silu(gw @ g)

    kv = (k @ v).float()
    # Read-out uses the current token's kv (weighted by time_faaaa) plus the
    # state accumulated so far, cast to bfloat16 for the matmul with r.
    out = r @ (time_faaaa * kv + state).to(torch.bfloat16)

    # Decay the old state per head/channel and add the current kv.
    state = kv + w.view(H, N, 1) * state

    out = torch.nn.functional.group_norm(out.view(1, H*N), num_groups=H, weight=ln_w, bias=ln_b, eps = 64e-5).view(H*N) # same as gn(x/8, eps=1e-5)
    return ow @ (out * g), x, state
# Prefer the torch.compile version of the kernel; if wrapping fails (for
# example torch.compile is missing or unsupported on this install), fall back
# to TorchScript. Only ordinary errors trigger the fallback, so Ctrl-C and
# SystemExit during start-up are no longer swallowed.
try:
    time_mixing = torch.compile(time_mixing__, mode="max-autotune", fullgraph=True, dynamic=False)
except Exception:
    time_mixing = torch.jit.script(time_mixing__)

########################################################################################################

def channel_mixing__(x, x_prev, time_maa_k, time_maa_r, kw, vw, rw):
    """Channel-mixing (feed-forward) step of one RWKV block for a single token.

    ``x`` is the layer-normed input and ``x_prev`` the input kept from the
    previous token; the two are blended with ``time_maa_k`` / ``time_maa_r``
    before the key and receptance projections.

    Returns ``(output, x)``; the caller stores ``x`` as the next ``x_prev``.
    """
    shift = x_prev - x
    key_input = x + shift * time_maa_k
    gate_input = x + shift * time_maa_r

    # Sigmoid gate from the receptance branch.
    gate = torch.sigmoid(rw @ gate_input)
    # Squared-ReLU activation on the key branch, then the value projection.
    hidden = torch.relu(kw @ key_input) ** 2
    projected = vw @ hidden

    return gate * projected, x
# Prefer the torch.compile version of the kernel; if wrapping fails (for
# example torch.compile is missing or unsupported on this install), fall back
# to TorchScript. Only ordinary errors trigger the fallback, so Ctrl-C and
# SystemExit during start-up are no longer swallowed.
try:
    channel_mixing = torch.compile(channel_mixing__, mode="max-autotune", fullgraph=True, dynamic=False)
except Exception:
    channel_mixing = torch.jit.script(channel_mixing__)

########################################################################################################

print(f'\nUsing CUDA bf16. Loading {args.MODEL_NAME} ...')
model = RWKV_RNN(args)

# Sampling settings read by generate_text.
TEMPERATURE = 1.5        # logits are divided by this before the softmax
NUM_TRIALS = 1           # not referenced by the visible code
LENGTH_PER_TRIAL = 2048  # upper bound on sampling steps per request
MIN_P = 0.1              # min-p cutoff, relative to the top token's probability
EOS_DECAY_DIV = 25       # time constant of the ramp that re-enables token 0
EOS_IGNORE = 25          # token 0 gets zero probability for this many steps

print(f'\nPreprocessing context (slow version. see v2/rwkv/model.py for fast version)')

# Zeroed recurrent state shared as the starting point of every request.
# Three consecutive entries per layer: 0=att_x_prev 1=att_kv 2=ffn_x_prev
global_init_state = []
for _ in range(args.n_layer):
    global_init_state.append(
        torch.zeros(args.n_embd, dtype=torch.bfloat16, requires_grad=False, device="cuda")
    )
    global_init_state.append(
        torch.zeros(
            (args.n_embd // args.head_size, args.head_size, args.head_size),
            dtype=torch.float, requires_grad=False, device="cuda",
        )
    )
    global_init_state.append(
        torch.zeros(args.n_embd, dtype=torch.bfloat16, requires_grad=False, device="cuda")
    )

def generate_text(context_str):
    """Continue ``context_str`` with the RWKV model and return the decoded text.

    The prompt is tokenized and fed through the model token by token, then up
    to ``LENGTH_PER_TRIAL`` tokens are sampled with temperature scaling and
    min-p filtering. Token 0 ends generation; its probability is forced to
    zero for the first ``EOS_IGNORE`` steps and ramped back in afterwards.
    Generation also stops when the last three tokens are identical.

    Returns the decoding of prompt tokens plus generated tokens. If the prompt
    produces no tokens at all there is nothing to condition on, and
    ``context_str`` is returned unchanged.
    """
    context_tokens = tokenizer.encode(context_str).ids
    if not context_tokens:
        # Without at least one forward pass there are no logits to sample
        # from; previously this path failed with UnboundLocalError.
        logger.warning("Prompt produced no tokens; nothing to complete")
        return context_str

    # A shallow copy is enough: the model replaces the list entries with new
    # tensors and never writes into the shared initial-state tensors.
    state = global_init_state.copy()
    for token in context_tokens:
        out, state = model.forward(token, state)

    ########################################################################################################

    all_tokens = list(context_tokens)

    added_tokens_count = 0
    for _ in range(LENGTH_PER_TRIAL):
        added_tokens_count += 1

        # applying temperature param
        scores = out / TEMPERATURE

        # convert logits to probabilities for min_p filtering
        probs = torch.softmax(scores.float(), dim=-1)

        # decaying eos token to induce longer generation
        if added_tokens_count <= EOS_IGNORE:
            decay_factor = 0
        else:
            decay_factor = 1.0 - math.exp(-((added_tokens_count - EOS_IGNORE) / EOS_DECAY_DIV))
        probs[0] *= decay_factor

        # min-p: drop every token whose probability is below MIN_P times the
        # probability of the most likely token
        max_prob, _ = probs.max(dim=-1, keepdim=True)
        scaled_min_p = MIN_P * max_prob
        indices_to_remove = probs < scaled_min_p
        scores.masked_fill_(indices_to_remove, float("-Inf"))

        # convert min_p filtered logits to probabilities
        probs = torch.softmax(scores.float(), dim=-1)
        # reapply eos token decay to new probabilities
        probs[0] *= decay_factor

        # .item() turns the sampled 0-dim tensor into a plain int, so that
        # all_tokens stays a homogeneous list of ints for tokenizer.decode and
        # model.forward receives the int it is declared to take.
        token = torch.multinomial(probs, num_samples=1).item()

        all_tokens.append(token)
        # if the last 3 tokens are all the same break
        if len(all_tokens) > 3 and all_tokens[-1] == all_tokens[-2] == all_tokens[-3]:
            break

        if token == 0:
            break

        out, state = model.forward(token, state)

        torch.cuda.synchronize()

    # we generated all the tokens we could
    output_string = tokenizer.decode(all_tokens)
    logger.info(
        "Generated %d = ctx:%d + gen:%d tokens, Result:%s",
        len(all_tokens), len(context_tokens), added_tokens_count, output_string,
    )

    return output_string

import logging

from telegram import ForceReply, Update
from telegram.ext import Application, CommandHandler, ContextTypes, MessageHandler, filters
# The bot token lives outside the source, in a local ".telegram_token" file;
# surrounding whitespace (such as a trailing newline) is stripped.
with open(".telegram_token") as f:
    TOKEN = f.read().strip()


# Send INFO-and-above records to tamilm.log with timestamp, logger name and level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    filename='tamilm.log',
)
# httpx would otherwise log every GET and POST request; keep only its warnings.
logging.getLogger("httpx").setLevel(logging.WARNING)

# Module-level logger used by generate_text.
logger = logging.getLogger(__name__)


# Define a few command handlers. These usually take the two arguments update and
# context.
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Reply to the /start command with a short usage hint."""
    message = update.message
    if message is None:
        # Command handlers also receive edited messages, for which
        # update.message is unset; there is nothing to answer in that case.
        return
    await message.reply_text("use /complete to complete your hebrew text in groupchats")


async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Reply to the /help command with a short usage hint."""
    message = update.message
    if message is None:
        # Command handlers also receive edited messages, for which
        # update.message is unset; there is nothing to answer in that case.
        return
    await message.reply_text("use /complete to complete your hebrew text")

async def complete_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Reply to /complete with a model-generated continuation of the prompt.

    The prompt is the text following the command. When the command is sent as
    a reply to another message, that message's text (if it has any) is put in
    front of the prompt. An empty prompt is answered with a usage hint
    instead of being sent to the model.
    """
    message = update.message
    if message is None:
        # Command handlers also receive edited messages, for which
        # update.message is unset; ignore those updates.
        return

    message_text = ' '.join(context.args)

    # if this is a reply to a message include it in the prompt; the replied-to
    # message may carry no text at all (photo, sticker, ...)
    replied_to = message.reply_to_message
    if replied_to is not None and replied_to.text:
        message_text = replied_to.text + ' ' + message_text

    if not message_text.strip():
        await message.reply_text("use /complete to complete your hebrew text")
        return

    generated_responce = generate_text(message_text)
    # Telegram rejects messages longer than 4096 characters, so trim the reply.
    await message.reply_text(generated_responce[:4096])


async def echo(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Answer plain-text messages in private chats with a generated completion.

    Messages from any other chat type are ignored.
    """
    message = update.message
    if message is None:
        # Edited messages and similar updates carry no update.message; ignore them.
        return
    if message.chat.type != "private":
        return

    generated_responce = generate_text(message.text)
    # Telegram rejects messages longer than 4096 characters, so trim the reply.
    await message.reply_text(generated_responce[:4096])


def main() -> None:
    """Build the Telegram application, register the handlers and poll until stopped."""
    application = Application.builder().token(TOKEN).build()

    # Slash commands and the callbacks that answer them.
    command_callbacks = (
        ("start", start),
        ("help", help_command),
        ("complete", complete_command),
    )
    for command_name, callback in command_callbacks:
        application.add_handler(CommandHandler(command_name, callback))

    # Every text message that is not a command is routed to echo.
    plain_text = filters.TEXT & ~filters.COMMAND
    application.add_handler(MessageHandler(plain_text, echo))

    print("BOT READY")

    # Blocks here until the process is interrupted (Ctrl-C or a signal).
    application.run_polling(allowed_updates=Update.ALL_TYPES)

if __name__ == "__main__":
    main()
Initial commit 2024-10-24 16:32:26 +03:00			`#!/usr/bin/env python`
			`# pylint: disable=unused-argument`
			`# This program is dedicated to the public domain under the CC0 license.`

			`"""`
			`Simple Bot to reply to Telegram messages.`

			`First, a few handler functions are defined. Then, those functions are passed to`
			`the Application and registered at their respective places.`
			`Then, the bot is started and runs until we press Ctrl-C on the command line.`

			`Usage:`
			`Basic Echobot example, repeats messages.`
			`Press Ctrl-C on the command line or send a signal to the process to stop the`
			`bot.`
			`"""`

			`import math`
			`########################################################################################################`
			`# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM`
			`########################################################################################################`

			`import numpy as np`
			`np.set_printoptions(precision=4, suppress=True, linewidth=200)`
			`import types, torch, copy, time`
			`from typing import List`
			`torch.backends.cudnn.benchmark = True`
			`torch.backends.cudnn.allow_tf32 = True`
			`torch.backends.cuda.matmul.allow_tf32 = True`
			`# torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True`
			`# torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True`
			`torch._C._jit_set_autocast_mode(False)`

			`import torch.nn as nn`
			`from torch.nn import functional as F`

			`MyModule = torch.jit.ScriptModule`
			`MyFunction = torch.jit.script_method`
			`MyStatic = torch.jit.script`

			`########################################################################################################`

			`from tokenizers import Tokenizer`
			`from tokenizers import decoders`
			`tokenizer = Tokenizer.from_file("tokenizer_vocab/tokenizer-tami-wordpiece-hedc4.json")`
			`tokenizer.decoder = decoders.WordPiece()`

			`args = types.SimpleNamespace()`
			`args.MODEL_NAME = 'tami-models/rwkv-final-maybe-knesset'`
			`args.n_layer = 12`
			`args.n_embd = 512`
			`args.vocab_size = 65536`
			`args.head_size = 64`

			`#context = "מחפשת בדחיפות מחשב נייד הכי פשוט עם מטען"`


			`########################################################################################################`

			`class RWKV_RNN(MyModule):`
			`def __init__(self, args):`
			`super().__init__()`
			`self.args = args`
			`self.n_embd = args.n_embd`
			`self.n_layer = args.n_layer`
			`self.eval()`

			`self.z = torch.load(args.MODEL_NAME + '.pth', map_location='cuda')`
			`z = self.z`
			`z['emb.weight'] = F.layer_norm(z['emb.weight'], (args.n_embd,), weight=z['blocks.0.ln0.weight'], bias=z['blocks.0.ln0.bias'])`

			`keys = list(z.keys())`
			`for k in keys:`
			`if '.time_' in k: z[k] = z[k].squeeze()`
			`if k.endswith('.time_decay'): z[k] = z[k].float()`
			`if k.endswith('.time_faaaa'): z[k] = z[k].unsqueeze(-1).float()`
			`for k in keys:`
			`if k.endswith('maa_w'):`
			`z[k.replace('maa_w','maa_wkvrg')] = torch.concat([z[k],z[k.replace('maa_w','maa_k')],z[k.replace('maa_w','maa_v')],z[k.replace('maa_w','maa_r')],z[k.replace('maa_w','maa_g')]]).clone().reshape(5, -1)`
			`del z[k]`
			`del z[k.replace('maa_w','maa_k')]`
			`del z[k.replace('maa_w','maa_v')]`
			`del z[k.replace('maa_w','maa_r')]`
			`del z[k.replace('maa_w','maa_g')]`

			`self.n_head = z['blocks.0.att.time_faaaa'].shape[0]`
			`self.head_size = z['blocks.0.ln1.weight'].shape[0] // self.n_head`
			`assert self.head_size == args.head_size`

			`@MyFunction`
			`def forward(self, token:int, state:List[torch.Tensor]):`
			`with torch.no_grad():`
			`z = self.z`
			`x = z['emb.weight'][token]`
			`for i in range(self.n_layer):`
			`bbb = f'blocks.{i}.'`
			`att = f'blocks.{i}.att.'`
			`ffn = f'blocks.{i}.ffn.'`

			`xx = F.layer_norm(x, (self.n_embd,), weight=z[bbb+'ln1.weight'], bias=z[bbb+'ln1.bias'])`
			`xx, state[i*3+0], state[i*3+1] = time_mixing(self.n_head, self.head_size, xx, state[i*3+0], state[i*3+1],`
			`z[att+'time_maa_x'], z[att+'time_maa_wkvrg'], z[att+'time_maa_w1'], z[att+'time_maa_w2'],`
			`z[att+'time_decay_w1'], z[att+'time_decay_w2'], z[att+'time_faaaa'], z[att+'time_decay'],`
			`z[att+'key.weight'], z[att+'value.weight'], z[att+'receptance.weight'], z[att+'gate.weight'], z[att+'output.weight'],`
			`z[att+'ln_x.weight'], z[att+'ln_x.bias'])`
			`x = x + xx`

			`xx = F.layer_norm(x, (self.n_embd,), weight=z[bbb+'ln2.weight'], bias=z[bbb+'ln2.bias'])`
			`xx, state[i*3+2] = channel_mixing(xx, state[i*3+2],`
			`z[ffn+'time_maa_k'], z[ffn+'time_maa_r'],`
			`z[ffn+'key.weight'], z[ffn+'value.weight'], z[ffn+'receptance.weight'])`
			`x = x + xx`

			`x = F.layer_norm(x, (self.n_embd,), weight=z['ln_out.weight'], bias=z['ln_out.bias'])`
			`x = z['head.weight'] @ x`
			`return x, state`

			`########################################################################################################`

			`def time_mixing__(H:int, N:int, x, x_prev, state, maa_x, maa_wkvrg, tm_w1, tm_w2, td_w1, td_w2, time_faaaa, time_decay, kw, vw, rw, gw, ow, ln_w, ln_b):`
			`sx = x_prev - x`
			`xxx = x + sx * maa_x # C`
			`xxx = torch.tanh(xxx @ tm_w1).view(5, 1, -1) # C @ C*5L => 5L => 5*1*L`
			`xxx = torch.bmm(xxx, tm_w2).view(5, -1) # 5*1*L @ 5*L*C => 5*1*C => 5*C`
			`xxx = xxx + maa_wkvrg`
			`xxx = xxx * sx.expand(5, -1) + x.expand(5, -1)`
			`w, k, v, r, g = xxx.unbind(dim=0)`

			`w = torch.tanh(w @ td_w1) @ td_w2`
			`w = w.float() + time_decay`
			`# assert w.dtype == torch.float`
			`w = torch.exp(-torch.exp(w))`

			`k = (kw @ k).view(H, N, 1)`
			`v = (vw @ v).view(H, 1, N)`
			`r = (rw @ r).view(H, 1, N)`
			`g = torch.nn.functional.silu(gw @ g)`

			`kv = (k @ v).float()`
			`out = r @ (time_faaaa * kv + state).to(torch.bfloat16)`

			`state = kv + w.view(H, N, 1) * state`

			`out = torch.nn.functional.group_norm(out.view(1, H*N), num_groups=H, weight=ln_w, bias=ln_b, eps = 64e-5).view(H*N) # same as gn(x/8, eps=1e-5)`
			`return ow @ (out * g), x, state`
			`try:`
			`time_mixing = torch.compile(time_mixing__, mode="max-autotune", fullgraph=True, dynamic=False)`
			`except:`
			`time_mixing = torch.jit.script(time_mixing__)`

			`########################################################################################################`

			`def channel_mixing__(x, x_prev, time_maa_k, time_maa_r, kw, vw, rw):`
			`sx = x_prev - x`
			`k = x + sx * time_maa_k`
			`r = x + sx * time_maa_r`

			`r = torch.sigmoid(rw @ r)`
			`k = torch.relu(kw @ k) ** 2`

			`return r * (vw @ k), x`
			`try:`
			`channel_mixing = torch.compile(channel_mixing__, mode="max-autotune", fullgraph=True, dynamic=False)`
			`except:`
			`channel_mixing = torch.jit.script(channel_mixing__)`

			`########################################################################################################`

			`print(f'\nUsing CUDA bf16. Loading {args.MODEL_NAME} ...')`
			`model = RWKV_RNN(args)`

			`TEMPERATURE = 1.5`
			`NUM_TRIALS = 1`
			`LENGTH_PER_TRIAL = 2048`
			`MIN_P = 0.1`
			`EOS_DECAY_DIV = 25`
			`EOS_IGNORE = 25`

			`print(f'\nPreprocessing context (slow version. see v2/rwkv/model.py for fast version)')`

			`global_init_state = [None for _ in range(args.n_layer * 3)]`
			`for i in range(args.n_layer): # state: 0=att_x_prev 1=att_kv 2=ffn_x_prev`
			`global_init_state[i*3+0] = torch.zeros(args.n_embd, dtype=torch.bfloat16, requires_grad=False, device="cuda")`
			`global_init_state[i*3+1] = torch.zeros((args.n_embd // args.head_size, args.head_size, args.head_size), dtype=torch.float, requires_grad=False, device="cuda")`
			`global_init_state[i*3+2] = torch.zeros(args.n_embd, dtype=torch.bfloat16, requires_grad=False, device="cuda")`

			`def generate_text(context_str):`
			`context_tokens = tokenizer.encode(context_str).ids`
			`state = global_init_state.copy()`
			`for token in context_tokens:`
			`out, state = model.forward(token, state)`

			`########################################################################################################`

			`all_tokens = []`
			`all_tokens += context_tokens`

			`added_tokens_count = 0`
			`for i in range(LENGTH_PER_TRIAL):`
			`added_tokens_count+=1`

			`# applying temperature param`
			`scores = out / TEMPERATURE`

			`min_p = MIN_P`

			`# convert logits to probabilities for min_p filtering`
			`probs = torch.softmax(scores.float(),dim=-1)`

			`# decaying eos token to induce longer generation`
			`if (added_tokens_count <= EOS_IGNORE):`
			`decay_factor = 0`
			`else:`
			`decay_factor = 1.0 - math.exp(-((added_tokens_count-EOS_IGNORE)/EOS_DECAY_DIV))`
			`probs[0] *= decay_factor`

			`# Get the probability of the top token for each sequence in the batch`
			`max_prob, _ = (probs).max(dim=-1, keepdim=True)`
			`# Calculate the actual min_p threshold by scaling min_p with the top token's probability`
			`scaled_min_p = min_p * max_prob`
			`# Create a mask for tokens that have a probability less than the scaled min_p`
			`indices_to_remove = probs < scaled_min_p`

			`scores.masked_fill_(indices_to_remove, float("-Inf"))`
			`#print(scores[scores != float("-Inf")])`

			`# convert min_p filtered logits to probabilities`
			`probs = torch.softmax(scores.float(),dim=-1)`
			`# reapply eos token decay to new probabilities`
			`probs[0] *= decay_factor`

			`token = torch.multinomial(probs, num_samples=1).squeeze(dim=-1)`

			`#print(token)`

			`all_tokens += [token]`
			`if len(all_tokens) > 3:`
			`# if the last 3 tokens are all the same break`
			`if all_tokens[-1] == all_tokens[-2] == all_tokens[-3]: break`

			`if token == 0: break`

			`out, state = model.forward(token, state)`

			`torch.cuda.synchronize()`

			`# we generated all the tokens we could`
			`output_string = tokenizer.decode(all_tokens)`
			`logger.info(f"Generated {len(all_tokens)} = ctx:{len(context_tokens)} + gen:{added_tokens_count} tokens, Result:{output_string}")`

			`# insert bolding of the input string TODO: wonky`
			`#output_string = output_string[:len(context_str)] + "**" + output_string[len(context_str):]`

			`return output_string`

			`import logging`

			`from telegram import ForceReply, Update`
			`from telegram.ext import Application, CommandHandler, ContextTypes, MessageHandler, filters`
			`# Read telegram token from .telegram_token`
			`with open(".telegram_token") as f:`
			`TOKEN = f.read().strip()`


			`# Enable logging`
			`logging.basicConfig(`
			`filename='tamilm.log',`
			`format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO`
			`)`
			`# set higher logging level for httpx to avoid all GET and POST requests being logged`
			`logging.getLogger("httpx").setLevel(logging.WARNING)`

			`logger = logging.getLogger(__name__)`


			`# Define a few command handlers. These usually take the two arguments update and`
			`# context.`
			`async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:`
			`"""Send a message when the command /start is issued."""`
			`await update.message.reply_text("use /complete to complete your hebrew text in groupchats")`


			`async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:`
			`"""Send a message when the command /help is issued."""`
			`await update.message.reply_text("use /complete to complete your hebrew text")`

			`async def complete_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:`
			`"""Send a message when the command /help is issued."""`
			`message_text = ' '.join(context.args)`

			`# if this is a reply to a message include it in the prompt`
			`if update.message.reply_to_message != None:`
			`message_text = update.message.reply_to_message.text + ' ' + message_text`

			`generated_responce = generate_text(message_text)`
			`#logger.info(generated_responce)`
			`await update.message.reply_text(generated_responce)`


			`async def echo(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:`
			`message_text = update.message.text`
			`if update.message.chat.type == "private":`
			`generated_responce = generate_text(message_text)`
			`#(generated_responce)`
			`await update.message.reply_text(generated_responce)`
			`"""Echo the user message."""`
			`#await update.message.reply_text(message_text)`


			`def main() -> None:`
			`"""Start the bot."""`
			`# Create the Application and pass it your bot's token.`
			`application = Application.builder().token(TOKEN).build()`

			`# on different commands - answer in Telegram`
			`application.add_handler(CommandHandler("start", start))`
			`application.add_handler(CommandHandler("help", help_command))`
			`application.add_handler(CommandHandler("complete", complete_command))`

			`# on non command i.e message - echo the message on Telegram`
			`application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, echo))`

			`print("BOT READY")`

			`# Run the bot until the user presses Ctrl-C`
			`application.run_polling(allowed_updates=Update.ALL_TYPES)`

			`if __name__ == "__main__":`
			`main()`