vision-career/vacancies/main/management/commands/collect_vacancies_from_hh_parser.py
estromenko af7dbf7246
All checks were successful
release / docker (push) Successful in 22s
Add missing company field to external vacancy model
2025-12-02 20:39:01 +03:00

46 lines
1.8 KiB
Python

from django.core.management import BaseCommand
from django.utils import timezone
from vacancies.hh_parser.models import Vacancy as ExternalVacancy
from vacancies.main.models import Vacancy, JobTitle
from flashrank import Ranker, RerankRequest
import re
# Non-greedy pattern for a single "<...>" span. No flags are passed, so "."
# does not match newlines and a tag split across lines is not matched.
# Command._remove_tags uses this pattern to strip such spans from text.
tags_regex = re.compile('<.*?>')
# flashrank Ranker created once when this module is imported, using the
# "ms-marco-TinyBERT-L-2-v2" model name. Command.handle calls rerank() on it
# to score job titles against each vacancy title.
reranker = Ranker("ms-marco-TinyBERT-L-2-v2")
class Command(BaseCommand):
    """Copy vacancies gathered by the hh.ru parser into the main ``Vacancy`` table.

    For every external vacancy that has both a title and a description, the
    command picks the ``JobTitle`` whose title gets the highest reranker score
    against the vacancy title, then creates a ``Vacancy`` keyed by
    ``external_id`` unless one already exists.
    """

    help = "Collect vacancies from hh.ru parser"

    def _remove_tags(self, text):
        """Return ``text`` with every match of ``tags_regex`` removed."""
        return tags_regex.sub("", text)

    def _ensure_aware(self, moment):
        """Return ``moment`` as a timezone-aware datetime.

        ``timezone.make_aware`` raises ``ValueError`` when given a datetime
        that already carries tzinfo, so only naive values are converted.
        """
        if timezone.is_naive(moment):
            return timezone.make_aware(moment)
        return moment

    def handle(self, *args, **options):
        """Run the import and report progress for each processed vacancy.

        Raises:
            CommandError: if there are no ``JobTitle`` rows to match against.
        """
        # Imported locally so this block does not depend on a change to the
        # file-level import list.
        from django.core.management.base import CommandError

        job_titles = list(JobTitle.objects.values("id", "title"))
        if not job_titles:
            # Without candidates there is nothing to rank; fail with a clear
            # message instead of an IndexError deep inside the loop.
            raise CommandError("No job titles found; cannot classify vacancies.")

        # The reranker scores the "text" field; "id" and "title" ride along so
        # the winning passage can be mapped back to its JobTitle.
        passages = [{"text": job_title["title"], **job_title} for job_title in job_titles]

        queryset = ExternalVacancy.objects.filter(
            title__isnull=False,
            description__isnull=False,
        )
        total_vacancies = queryset.count()

        for position, external_vacancy in enumerate(queryset, start=1):
            results = reranker.rerank(
                RerankRequest(query=external_vacancy.title, passages=passages)
            )
            # max() returns the first highest-scoring passage, the same element
            # a stable descending sort would place at index 0.
            best_match = max(results, key=lambda result: result["score"])

            # Strip markup once; both text fields store the same cleaned text.
            clean_description = self._remove_tags(external_vacancy.description)

            vacancy, created = Vacancy.objects.get_or_create(
                external_id=external_vacancy.id,
                defaults=dict(
                    job_title_id=best_match["id"],
                    min_salary_rub=external_vacancy.min_payment,
                    max_salary_rub=external_vacancy.max_payment,
                    company_name=external_vacancy.company,
                    requirements=clean_description,
                    content=clean_description,
                    timestamp=self._ensure_aware(external_vacancy.created_at),
                    link=external_vacancy.link,
                ),
            )
            self.stdout.write(
                f"{position}/{total_vacancies} Vacancy: {vacancy}, created: {created}"
            )