import re
from operator import itemgetter
from typing import Literal

from django.core.management import BaseCommand, CommandError
from django.utils import timezone
from flashrank import Ranker, RerankRequest
from langchain_openai import ChatOpenAI
from pydantic import BaseModel

from vacancies.hh_parser.models import Vacancy as ExternalVacancy
from vacancies.main import prompts
from vacancies.main.models import JobTitle, Vacancy

# Non-greedy match of anything between "<" and ">" on a single line.
tags_regex = re.compile('<.*?>')

# Reranker used to pick the closest known job title for a vacancy title.
# Created once at import time and reused for every vacancy.
reranker = Ranker("ms-marco-TinyBERT-L-2-v2")


class Command(BaseCommand):
    """Copy vacancies collected by the hh.ru parser into ``Vacancy`` rows.

    Each parser record that has both a title and a description is matched to
    the best-scoring ``JobTitle`` by the reranker and stored with
    ``get_or_create`` keyed on ``external_id``, so re-running the command
    does not duplicate vacancies.
    """

    help = "Collect vacancies from hh.ru parser"

    def _remove_tags(self, text: str) -> str:
        """Return *text* with every ``<...>`` fragment removed."""
        return tags_regex.sub("", text)

    def _extract_requirements(self) -> str:
        """Ask the LLM for the structured ``requirements`` text and return it."""

        # No docstring on this class on purpose: pydantic would include it in
        # the schema that is sent to the model.
        class Structure(BaseModel):
            # Fields currently disabled in the schema (the corresponding
            # Vacancy columns are filled from the parser record and the
            # reranker instead):
            #   job_titles = JobTitle.objects.values_list('title', flat=True)
            #   job_title: Literal[tuple(job_titles)]
            #   original_title: str
            #   min_salary_rub: int | None
            #   max_salary_rub: int | None
            #   company_name: str
            requirements: str

        openai_client = ChatOpenAI(
            model_name="openai/gpt-5-mini",
            openai_api_base="https://openrouter.ai/api/v1",
            temperature=0,
            seed=42,
            top_p=1,
        )
        structured_llm = openai_client.with_structured_output(Structure)
        return structured_llm.invoke(prompts.STRUCTURED_OUTPUT_PROMPT).requirements

    def _match_job_title_id(self, title: str, passages: list[dict]):
        """Return the ``id`` of the passage the reranker scores highest for *title*."""
        results = reranker.rerank(RerankRequest(query=title, passages=passages))
        # max() returns the first top-scoring entry, which is the same element
        # a stable descending sort would place first.
        return max(results, key=itemgetter("score"))["id"]

    def _build_defaults(self, external, passages: list[dict], requirements: str) -> dict:
        """Build the field values used to create a ``Vacancy`` from *external*."""
        created_at = external.created_at
        # make_aware() raises ValueError on an already-aware datetime, so only
        # convert values that are actually naive.
        if timezone.is_naive(created_at):
            created_at = timezone.make_aware(created_at)
        return {
            "job_title_id": self._match_job_title_id(external.title, passages),
            "original_title": external.title,
            "min_salary_rub": external.min_payment,
            "max_salary_rub": external.max_payment,
            "company_name": external.company,
            "requirements": requirements,
            "content": self._remove_tags(external.description),
            "timestamp": created_at,
            "link": external.link,
        }

    def handle(self, *args, **options) -> None:
        """Import every eligible parser vacancy and report progress per record.

        Raises:
            CommandError: if there are no job titles to match vacancies against.
        """
        passages = [
            {"text": job_title["title"], **job_title}
            for job_title in JobTitle.objects.values("id", "title")
        ]
        if not passages:
            # Fail early (and before the LLM call) instead of hitting an
            # IndexError on the first vacancy.
            raise CommandError("No job titles found: vacancies cannot be matched to a job title.")

        queryset = ExternalVacancy.objects.filter(title__isnull=False, description__isnull=False)
        total_vacancies = queryset.count()

        # NOTE(review): the prompt is static and the LLM is invoked once, so
        # every vacancy created in this run receives the same `requirements`
        # text. Confirm this is intended; per-vacancy extraction would need
        # the vacancy content in the prompt.
        requirements = self._extract_requirements()

        for index, external in enumerate(queryset, start=1):
            try:
                # Look up first so already-imported vacancies skip the
                # reranker inference and tag stripping entirely.
                vacancy = Vacancy.objects.get(external_id=external.id)
            except Vacancy.DoesNotExist:
                vacancy, created = Vacancy.objects.get_or_create(
                    external_id=external.id,
                    defaults=self._build_defaults(external, passages, requirements),
                )
            else:
                created = False

            self.stdout.write(f"{index}/{total_vacancies} Vacancy: {vacancy}, created: {created}")