import re

from extractdata import extract_text
from wordcloudplot import plot_wordcloud
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Collapse newlines and runs of whitespace into single spaces before tokenization.
WHITESPACE_HANDLER = lambda k: re.sub(r"\s+", " ", re.sub(r"\n+", " ", k.strip()))


def summarize(input_, model):
    """Summarize raw text or the text extracted from a URL, and return the
    summary together with a word-cloud figure of the source text."""
    # If the input looks like a URL, pull the article text from the page;
    # otherwise treat the input itself as the text to summarize.
    if input_.startswith("https:"):
        text = extract_text(input_)
    else:
        text = input_

    # Pick the checkpoint that matches the requested model family.
    if model == "T5":
        checkpoint = "csebuetnlp/mT5_multilingual_XLSum"
    elif model == "BART":
        checkpoint = "ai4bharat/IndicBART-XLSum"
    else:
        raise ValueError(f"Unsupported model: {model}")

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    summarizer = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    # Tokenize the cleaned text, padding/truncating to the model's 512-token limit.
    input_ids = tokenizer(
        [WHITESPACE_HANDLER(text)],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )["input_ids"]

    # Beam-search generation bounded to a 30-70 token summary, avoiding repeated bigrams.
    output_ids = summarizer.generate(
        input_ids=input_ids,
        max_length=70,
        min_length=30,
        no_repeat_ngram_size=2,
        num_beams=4,
    )[0]

    summary = tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    figure = plot_wordcloud(text)
    return summary, figure
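
# Minimal usage sketch (an illustration, not part of the Space's UI code): it
# assumes the module is run directly and that plot_wordcloud returns a
# matplotlib Figure; the sample URL and output filename below are hypothetical.
if __name__ == "__main__":
    sample_url = "https://example.com/news/article"  # hypothetical article URL
    summary, figure = summarize(sample_url, model="T5")
    print(summary)
    figure.savefig("wordcloud.png")  # persist the word cloud alongside the summary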