From fb95e6e7999e515d9155950d2078e20dfe76d095 Mon Sep 17 00:00:00 2001 From: estromenko Date: Sun, 26 Oct 2025 18:06:45 +0300 Subject: [PATCH] Simplify vacancy recommendations --- README.md | 1 - vacancies/main/features_extractor.py | 29 --------------- ...ollect_vacancies_from_telegram_messages.py | 8 ++--- .../generate_recommended_vacancies.py | 11 ++---- .../management/commands/process_vacancies.py | 20 ----------- .../0005_remove_vacancy_is_processed.py | 17 +++++++++ vacancies/main/models.py | 1 - vacancies/main/vector_store.py | 35 +++++++++++++++---- 8 files changed, 53 insertions(+), 69 deletions(-) delete mode 100644 vacancies/main/features_extractor.py delete mode 100644 vacancies/main/management/commands/process_vacancies.py create mode 100644 vacancies/main/migrations/0005_remove_vacancy_is_processed.py diff --git a/README.md b/README.md index 10b19c0..b54906d 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,6 @@ uv run --env-file .env manage.py migrate uv run --env-file .env manage.py createsuperuser --username stromenko_es --email estromenko@mail.ru uv run --env-file .env manage.py runserver -uv run --env-file .env manage.py process_vacancies uv run --env-file .env manage.py generate_recommended_vacancies uv run --env-file .env manage.py collect_vacancies_from_telegram_messages uv run --env-file .env manage.py runbot diff --git a/vacancies/main/features_extractor.py b/vacancies/main/features_extractor.py deleted file mode 100644 index 656f8c9..0000000 --- a/vacancies/main/features_extractor.py +++ /dev/null @@ -1,29 +0,0 @@ -from langchain_openai import ChatOpenAI -from vacancies.main.models import VacancyFeatures - - -def extract_vacancy_features(content: str) -> VacancyFeatures: - """Extract features from vacancy content using structured output.""" - - prompt = f""" - Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null. - Features: - - employment_type: Тип занятости (e.g., Полная занятость, Частичная) - - work_format: Формат работы (e.g., Офис, Удалённо, Гибрид) - - experience: Опыт работы (e.g., 3-5 лет, Нет опыта) - - position_level: Уровень позиции (e.g., Junior, Senior) - - industry: Отрасль / Сфера деятельности (e.g., IT, Финансы) - - tech_stack: Технологический стек / Ключевые навыки (list of strings) - - location: География (e.g., Москва, Россия) - - salary_range: Зарплатные ожидания / вилка (e.g., 100000-200000 руб) - - languages: Языки (list of strings, e.g., ["Русский", "Английский"]) - - education: Образование (e.g., Высшее, Среднее специальное) - - schedule: График работы (e.g., Полный день, Сменный) - - additional_requirements: Дополнительные предпочтения / требования (list of strings) - Vacancy content: - {content} - """ - openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal") - structured_llm = openai_client.with_structured_output(VacancyFeatures) - response = structured_llm.invoke(prompt) - return response diff --git a/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py b/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py index b0a0282..88ff7e0 100644 --- a/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py +++ b/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py @@ -1,8 +1,7 @@ from django.core.management import BaseCommand from vacancies.main.models import Vacancy import clickhouse_connect -from vacancies.main.vector_store import add_vacancy_vectors -from vacancies.main.features_extractor import extract_vacancy_features +from vacancies.main.vector_store import add_vacancy_vectors, extract_vacancy_features clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123) @@ -29,10 +28,11 @@ class Command(BaseCommand): (id, chat_username, telegram_id, message, timestamp) = row link = f"https://t.me/{chat_username}/{telegram_id}" + features = extract_vacancy_features(message) vacancy, created = Vacancy.objects.get_or_create( link=link, - defaults={'content': message} + defaults={'content': message, 'features_json': features.model_dump()} ) print(index, link) - add_vacancy_vectors(vacancy.id, extract_vacancy_features(message).model_dump(), {"link": link}) + add_vacancy_vectors(vacancy.id, features.model_dump(), {"link": link}) diff --git a/vacancies/main/management/commands/generate_recommended_vacancies.py b/vacancies/main/management/commands/generate_recommended_vacancies.py index c415c04..8cc75be 100644 --- a/vacancies/main/management/commands/generate_recommended_vacancies.py +++ b/vacancies/main/management/commands/generate_recommended_vacancies.py @@ -15,19 +15,14 @@ class Command(BaseCommand): for customer_cv in customer_cvs: features = extract_vacancy_features(customer_cv.content) - print(features) recommended_vacancy_ids = RecommendedVacancy.objects.filter( - customer=customer_cv.customer + customer=customer_cv.customer, ).values_list('vacancy_id', flat=True) - query_filter = Filter( - must_not = [ - HasIdCondition(has_id=recommended_vacancy_ids), - ] - ) + query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)]) search_result_id = search_similarities(features.model_dump(), query_filter) - recommendation, _ = RecommendedVacancy.objects.get_or_create( + recommendation = RecommendedVacancy.objects.create( customer=customer_cv.customer, vacancy_id=search_result_id, ) diff --git a/vacancies/main/management/commands/process_vacancies.py b/vacancies/main/management/commands/process_vacancies.py deleted file mode 100644 index d6b463f..0000000 --- a/vacancies/main/management/commands/process_vacancies.py +++ /dev/null @@ -1,20 +0,0 @@ -from django.core.management import BaseCommand -from vacancies.main.models import Vacancy -from vacancies.main.features_extractor import extract_vacancy_features -from vacancies.main.vector_store import add_vacancy_vectors - - -class Command(BaseCommand): - help = "Process vacancies: extract features and index in vector store" - - def handle(self, *args, **options): - vacancies = Vacancy.objects.filter(is_processed=False) - - len_vacancies = len(vacancies) - for index, vacancy in enumerate(vacancies): - print(f"Processing {index}/{len_vacancies} {vacancy}") - features = extract_vacancy_features(vacancy.content) - vacancy.features_json = features.model_dump() - vacancy.is_processed = True - vacancy.save() - add_vacancy_vectors(vacancy.id, features.model_dump(), {"link": vacancy.link}) diff --git a/vacancies/main/migrations/0005_remove_vacancy_is_processed.py b/vacancies/main/migrations/0005_remove_vacancy_is_processed.py new file mode 100644 index 0000000..aa2b3e1 --- /dev/null +++ b/vacancies/main/migrations/0005_remove_vacancy_is_processed.py @@ -0,0 +1,17 @@ +# Generated by Django 5.2.7 on 2025-10-26 15:06 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('main', '0004_vacancy_features_json_vacancy_is_processed'), + ] + + operations = [ + migrations.RemoveField( + model_name='vacancy', + name='is_processed', + ), + ] diff --git a/vacancies/main/models.py b/vacancies/main/models.py index 687fea3..b1bde3c 100644 --- a/vacancies/main/models.py +++ b/vacancies/main/models.py @@ -35,7 +35,6 @@ class Vacancy(models.Model): content = models.TextField() link = models.URLField(unique=True) features_json = models.JSONField(null=True, blank=True) - is_processed = models.BooleanField(default=False) created_at = models.DateTimeField(auto_now_add=True) objects = models.Manager() diff --git a/vacancies/main/vector_store.py b/vacancies/main/vector_store.py index 90a6acf..f28df2d 100644 --- a/vacancies/main/vector_store.py +++ b/vacancies/main/vector_store.py @@ -1,10 +1,12 @@ from qdrant_client import models from langchain_openai import OpenAIEmbeddings +from langchain_openai import ChatOpenAI from qdrant_client import QdrantClient from qdrant_client.models import Filter +from vacancies.main.models import VacancyFeatures -# client = QdrantClient(path="./embeddings") -client = QdrantClient(url="http://localhost:6333") +client = QdrantClient(path="./embeddings") +#client = QdrantClient(url="http://localhost:6333") FEATURE_NAMES = [ "employment_type", "work_format", "experience", "position_level", "industry", "tech_stack", @@ -59,10 +61,6 @@ def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict): def search_similarities(query_features: dict, query_filter: Filter) -> list[dict]: - """Search vacancies using sum of max similarities. - For each feature, compute similarities, then for each vacancy, take max per feature, sum. - Return top vacancies. - """ texts = _prepare_texts(query_features) vectors = {} for name, text in texts.items(): @@ -95,3 +93,28 @@ def search_similarities(query_features: dict, query_filter: Filter) -> list[dict scored.sort(key=lambda x: x["score"], reverse=True) return scored[0]["id"] + + +def extract_vacancy_features(content: str) -> VacancyFeatures: + prompt = f""" + Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null. + Features: + - employment_type: Тип занятости (e.g., Полная занятость, Частичная) + - work_format: Формат работы (e.g., Офис, Удалённо, Гибрид) + - experience: Опыт работы (e.g., 3-5 лет, Нет опыта) + - position_level: Уровень позиции (e.g., Junior, Senior) + - industry: Отрасль / Сфера деятельности (e.g., IT, Финансы) + - tech_stack: Технологический стек / Ключевые навыки (list of strings) + - location: География (e.g., Москва, Россия) + - salary_range: Зарплатные ожидания / вилка (e.g., 100000-200000 руб) + - languages: Языки (list of strings, e.g., ["Русский", "Английский"]) + - education: Образование (e.g., Высшее, Среднее специальное) + - schedule: График работы (e.g., Полный день, Сменный) + - additional_requirements: Дополнительные предпочтения / требования (list of strings) + Vacancy content: + {content} + """ + openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal") + structured_llm = openai_client.with_structured_output(VacancyFeatures) + response = structured_llm.invoke(prompt) + return response