73 lines
2.7 KiB
Python
73 lines
2.7 KiB
Python
import re
|
|
from typing import Literal
|
|
|
|
from django.core.management import BaseCommand
|
|
from django.utils import timezone
|
|
from flashrank import Ranker, RerankRequest
|
|
from langchain_openai import ChatOpenAI
|
|
from pydantic import BaseModel
|
|
|
|
from vacancies.hh_parser.models import Vacancy as ExternalVacancy
|
|
from vacancies.main import prompts
|
|
from vacancies.main.models import JobTitle, Vacancy
|
|
|
|
# Pre-compiled pattern matching the shortest "<...>" run (no DOTALL flag, so a
# match never spans a newline); used by Command._remove_tags to strip markup.
tags_regex = re.compile('<.*?>')
|
|
|
|
# Shared FlashRank ranker, constructed once when this module is imported and
# used by the Command class below to score job titles against a vacancy title.
reranker = Ranker("ms-marco-TinyBERT-L-2-v2")
|
|
|
|
|
|
class Command(BaseCommand):
    """Copy parsed hh.ru vacancies into the main ``Vacancy`` table.

    Every ``ExternalVacancy`` that has both a title and a description is
    matched to the best-scoring ``JobTitle`` via the module-level ``reranker``
    and stored with ``Vacancy.objects.get_or_create`` keyed on ``external_id``,
    so vacancies that were already imported are left as they are.
    """

    help = "Collect vacancies from hh.ru parser"

    def _remove_tags(self, text):
        """Return ``text`` with every match of ``tags_regex`` removed."""
        return tags_regex.sub("", text)

    def _best_job_title_id(self, title, passages):
        """Return the ``id`` of the passage the reranker scores highest for ``title``.

        ``passages`` is the list of dicts built in ``handle``; each carries the
        job title's ``id`` alongside the ``text`` that is ranked.
        """
        results = reranker.rerank(RerankRequest(query=title, passages=passages))
        # max() returns the first of equally scored results, which is the same
        # element a stable descending sort would have placed at index 0.
        return max(results, key=lambda result: result["score"])["id"]

    def handle(self, *args, **options):
        """Import all eligible external vacancies and report progress per row.

        Raises:
            CommandError: if there are no job titles to match vacancies against.
        """
        # Imported locally so this method does not depend on a change to the
        # module's import block.
        from django.core.management import CommandError

        job_titles = list(JobTitle.objects.values("id", "title"))
        if not job_titles:
            # Fail before the LLM request instead of crashing on the first
            # vacancy when the ranking result has nothing to pick from.
            raise CommandError("No job titles found; cannot match vacancies to a job title.")
        passages = [{"text": job_title["title"], **job_title} for job_title in job_titles]

        queryset = ExternalVacancy.objects.filter(title__isnull=False, description__isnull=False)
        total_vacancies = queryset.count()

        # job_titles = JobTitle.objects.values_list('title', flat=True)
        class Structure(BaseModel):
            # Schema passed to with_structured_output(); only `requirements`
            # is active, the remaining fields are disabled.
            # job_title: Literal[tuple(job_titles)]
            # original_title: str
            # min_salary_rub: int | None
            # max_salary_rub: int | None
            # company_name: str
            requirements: str

        openai_client = ChatOpenAI(
            model_name="openai/gpt-5-mini",
            openai_api_base="https://openrouter.ai/api/v1",
            temperature=0,
            seed=42,
            top_p=1,
        )
        structured_llm = openai_client.with_structured_output(Structure)
        # NOTE(review): the model is invoked once with a prompt that contains
        # no vacancy data, and the same `requirements` value is then stored
        # for every vacancy created below. Presumably this should run per
        # vacancy with its description - confirm against the prompt template.
        response = structured_llm.invoke(prompts.STRUCTURED_OUTPUT_PROMPT)

        for index, external in enumerate(queryset, start=1):
            job_title_id = self._best_job_title_id(external.title, passages)

            # make_aware() raises ValueError for datetimes that already carry
            # tzinfo, so only convert naive values.
            created_at = external.created_at
            if timezone.is_naive(created_at):
                created_at = timezone.make_aware(created_at)

            vacancy, created = Vacancy.objects.get_or_create(
                external_id=external.id,
                defaults={
                    "job_title_id": job_title_id,
                    "original_title": external.title,
                    "min_salary_rub": external.min_payment,
                    "max_salary_rub": external.max_payment,
                    "company_name": external.company,
                    "requirements": response.requirements,
                    "content": self._remove_tags(external.description),
                    "timestamp": created_at,
                    "link": external.link,
                },
            )

            # self.stdout respects the stream Django hands to the command
            # (e.g. call_command(stdout=...)), unlike print().
            self.stdout.write(f"{index}/{total_vacancies} Vacancy: {vacancy}, created: {created}")
|