Fix vacancies recommendations weights
This commit is contained in:
parent
cf9f19a216
commit
9dceaaeccc
@ -5,23 +5,7 @@ from vacancies.main.vector_store import add_vectors, extract_features, client as
|
|||||||
|
|
||||||
clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123)
|
clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123)
|
||||||
|
|
||||||
next_page_offset = 0
|
query = """
|
||||||
exist_points_ids = []
|
|
||||||
while next_page_offset is not None:
|
|
||||||
response = qdrant.scroll(
|
|
||||||
collection_name="vacancies",
|
|
||||||
limit=100_000,
|
|
||||||
offset=next_page_offset,
|
|
||||||
with_payload=False,
|
|
||||||
with_vectors=False,
|
|
||||||
timeout=30,
|
|
||||||
)
|
|
||||||
exist_points_ids.extend([point.id for point in response[0]])
|
|
||||||
next_page_offset = response[1]
|
|
||||||
exist_points_set = tuple(exist_points_ids)
|
|
||||||
print("qdrant vacancies points count:", len(exist_points_set))
|
|
||||||
|
|
||||||
query = f"""
|
|
||||||
SELECT id, chat_username, telegram_id, message, timestamp
|
SELECT id, chat_username, telegram_id, message, timestamp
|
||||||
FROM telegram_parser_chatmessage
|
FROM telegram_parser_chatmessage
|
||||||
WHERE timestamp >= now() - INTERVAL 30 DAY
|
WHERE timestamp >= now() - INTERVAL 30 DAY
|
||||||
@ -33,7 +17,7 @@ WHERE timestamp >= now() - INTERVAL 30 DAY
|
|||||||
'удаленно', 'гибкий график', 'полный день', 'частичная занятость',
|
'удаленно', 'гибкий график', 'полный день', 'частичная занятость',
|
||||||
'резюме', 'собеседование', 'junior', 'middle', 'senior'
|
'резюме', 'собеседование', 'junior', 'middle', 'senior'
|
||||||
]) >= 5 AND position(message, 'О себе') = 0 AND position(message, 'Обо мне') = 0 AND position(message, '#ищу') = 0
|
]) >= 5 AND position(message, 'О себе') = 0 AND position(message, 'Обо мне') = 0 AND position(message, '#ищу') = 0
|
||||||
AND id NOT IN {exist_points_set}
|
AND id NOT IN %(exist_points)s
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@ -41,28 +25,36 @@ class Command(BaseCommand):
|
|||||||
help = "Collect vacancies from telegram messages"
|
help = "Collect vacancies from telegram messages"
|
||||||
|
|
||||||
def handle(self, *args, **options):
|
def handle(self, *args, **options):
|
||||||
import time
|
next_page_offset = 0
|
||||||
start_time = time.time()
|
exist_points_ids = [-1]
|
||||||
result_rows = clickhouse_client.query(query).result_rows
|
while next_page_offset is not None:
|
||||||
print(f"query time: {time.time() - start_time:.4f}")
|
response = qdrant.scroll(
|
||||||
|
collection_name="vacancies",
|
||||||
|
limit=100_000,
|
||||||
|
offset=next_page_offset,
|
||||||
|
with_payload=False,
|
||||||
|
with_vectors=False,
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
exist_points_ids.extend([point.id for point in response[0]])
|
||||||
|
next_page_offset = response[1]
|
||||||
|
exist_points_set = tuple(set(exist_points_ids))
|
||||||
|
|
||||||
|
result_rows = clickhouse_client.query(query, parameters={"exist_points": exist_points_set}).result_rows
|
||||||
result_rows_len = len(result_rows)
|
result_rows_len = len(result_rows)
|
||||||
for index, row in enumerate(result_rows):
|
for index, row in enumerate(result_rows):
|
||||||
(id, chat_username, telegram_id, message, timestamp) = row
|
(id, chat_username, telegram_id, message, timestamp) = row
|
||||||
|
|
||||||
link = f"https://t.me/{chat_username}/{telegram_id}"
|
link = f"https://t.me/{chat_username}/{telegram_id}"
|
||||||
print(f"Processing {index}/{result_rows_len} link: {link}")
|
print(f"Processing {index}/{result_rows_len} link: {link}")
|
||||||
start_time = time.time()
|
|
||||||
features = extract_features(message)
|
features = extract_features(message)
|
||||||
print(f"ai time: {time.time() - start_time:.4f}")
|
|
||||||
vacancy, created = Vacancy.objects.get_or_create(
|
vacancy, created = Vacancy.objects.get_or_create(
|
||||||
link=link,
|
link=link,
|
||||||
)
|
)
|
||||||
|
|
||||||
start_time = time.time()
|
|
||||||
add_vectors(
|
add_vectors(
|
||||||
"vacancies",
|
"vacancies",
|
||||||
vacancy.id,
|
vacancy.id,
|
||||||
features.model_dump(),
|
features.model_dump(),
|
||||||
{'content': message, 'features_json': features.model_dump()},
|
{'content': message, 'features_json': features.model_dump()},
|
||||||
)
|
)
|
||||||
print(f"write vector time: {time.time() - start_time:.4f}")
|
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
from django.core.management import BaseCommand
|
from django.core.management import BaseCommand
|
||||||
from vacancies.main.vector_store import search_similarities, client
|
from vacancies.main.vector_store import search_similarities
|
||||||
from vacancies.main.models import CustomerCV, RecommendedVacancy
|
from vacancies.main.models import CustomerCV, RecommendedVacancy
|
||||||
from vacancies.main.bot import application
|
from vacancies.main.bot import application
|
||||||
from telegram import InlineKeyboardButton, InlineKeyboardMarkup
|
from telegram import InlineKeyboardButton, InlineKeyboardMarkup
|
||||||
@ -19,12 +20,7 @@ class Command(BaseCommand):
|
|||||||
).values_list('vacancy_id', flat=True)
|
).values_list('vacancy_id', flat=True)
|
||||||
|
|
||||||
query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)])
|
query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)])
|
||||||
search_result_id = search_similarities(query_filter, customer_cv.id)
|
search_result_id, vacancy_content = search_similarities(query_filter, customer_cv.id)
|
||||||
|
|
||||||
vacancy_content = client.retrieve(
|
|
||||||
collection_name="vacancies",
|
|
||||||
ids=[search_result_id],
|
|
||||||
)[0].payload["content"]
|
|
||||||
|
|
||||||
recommendation = RecommendedVacancy.objects.create(
|
recommendation = RecommendedVacancy.objects.create(
|
||||||
customer=customer_cv.customer,
|
customer=customer_cv.customer,
|
||||||
|
|||||||
@ -62,6 +62,7 @@ class RecommendedVacancy(models.Model):
|
|||||||
|
|
||||||
|
|
||||||
class VacancyFeatures(BaseModel):
|
class VacancyFeatures(BaseModel):
|
||||||
|
job_title: str | None = None # Должность
|
||||||
employment_type: str | None = None # Тип занятости
|
employment_type: str | None = None # Тип занятости
|
||||||
work_format: str | None = None # Формат работы
|
work_format: str | None = None # Формат работы
|
||||||
experience: str | None = None # Опыт работы
|
experience: str | None = None # Опыт работы
|
||||||
|
|||||||
@ -9,15 +9,16 @@ from vacancies.main.models import VacancyFeatures
|
|||||||
client = QdrantClient(url="http://localhost:6333")
|
client = QdrantClient(url="http://localhost:6333")
|
||||||
|
|
||||||
FEATURE_NAMES = [
|
FEATURE_NAMES = [
|
||||||
"employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
|
"job_title", "employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
|
||||||
"location", "salary_range", "languages", "education", "schedule", "additional_requirements"
|
"location", "salary_range", "languages", "education", "schedule", "additional_requirements"
|
||||||
]
|
]
|
||||||
|
|
||||||
weights = {
|
weights = {
|
||||||
|
"job_title": 10,
|
||||||
"employment_type": 2,
|
"employment_type": 2,
|
||||||
"work_format": 2,
|
"work_format": 2,
|
||||||
"experience": 4,
|
"experience": 3,
|
||||||
"position_level": 4,
|
"position_level": 5,
|
||||||
"industry": 4,
|
"industry": 4,
|
||||||
"tech_stack": 5,
|
"tech_stack": 5,
|
||||||
"location": 2,
|
"location": 2,
|
||||||
@ -87,35 +88,40 @@ def search_similarities(query_filter: Filter, cv_id: int) -> list[dict]:
|
|||||||
)[0].vector
|
)[0].vector
|
||||||
|
|
||||||
max_similarities = {}
|
max_similarities = {}
|
||||||
|
vacancies_content = {}
|
||||||
for name, vec in vectors.items():
|
for name, vec in vectors.items():
|
||||||
if any(v != 0 for v in vec):
|
if any(v != 0 for v in vec):
|
||||||
results = client.search(
|
results = client.query_points(
|
||||||
collection_name="vacancies",
|
collection_name="vacancies",
|
||||||
query_vector=(name, vec),
|
query=vec,
|
||||||
|
using=name,
|
||||||
limit=1000,
|
limit=1000,
|
||||||
with_payload=True,
|
with_payload=True,
|
||||||
query_filter=query_filter,
|
query_filter=query_filter,
|
||||||
)
|
)
|
||||||
for res in results:
|
for res in results.points:
|
||||||
vid = res.id
|
vid = res.id
|
||||||
sim = res.score
|
sim = res.score
|
||||||
if vid not in max_similarities:
|
if vid not in max_similarities:
|
||||||
max_similarities[vid] = {}
|
max_similarities[vid] = {}
|
||||||
max_similarities[vid][name] = sim
|
max_similarities[vid][name] = sim
|
||||||
|
if vid not in vacancies_content:
|
||||||
|
vacancies_content[vid] = res.payload["content"]
|
||||||
|
|
||||||
scored = []
|
scored = []
|
||||||
for vid, feature_sims in max_similarities.items():
|
for vid, feature_sims in max_similarities.items():
|
||||||
total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims)
|
total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims)
|
||||||
scored.append({"id": vid, "score": total})
|
scored.append({"id": vid, "score": total, "content": vacancies_content[vid]})
|
||||||
|
|
||||||
scored.sort(key=lambda x: x["score"], reverse=True)
|
scored.sort(key=lambda x: x["score"], reverse=True)
|
||||||
return scored[0]["id"]
|
return scored[0]["id"], scored[0]["content"]
|
||||||
|
|
||||||
|
|
||||||
def extract_features(content: str) -> VacancyFeatures:
|
def extract_features(content: str) -> VacancyFeatures:
|
||||||
prompt = f"""
|
prompt = f"""
|
||||||
Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
|
Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
|
||||||
Features:
|
Features:
|
||||||
|
- job_title: Должность (e.g., DevOps, Python программист)
|
||||||
- employment_type: Тип занятости (e.g., Полная занятость, Частичная)
|
- employment_type: Тип занятости (e.g., Полная занятость, Частичная)
|
||||||
- work_format: Формат работы (e.g., Офис, Удалённо, Гибрид)
|
- work_format: Формат работы (e.g., Офис, Удалённо, Гибрид)
|
||||||
- experience: Опыт работы (e.g., 3-5 лет, Нет опыта)
|
- experience: Опыт работы (e.g., 3-5 лет, Нет опыта)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user