vision-career/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py

45 lines
1.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from django.core.management import BaseCommand
from vacancies.main.models import Vacancy
import clickhouse_connect
from langchain_core.documents import Document
from vacancies.main.vector_store import vector_store
# ClickHouse client shared by the command below; it is created as a side
# effect of importing this module.
# NOTE(review): host and port are hard-coded to a local endpoint — confirm
# this is intended for every environment the command runs in.
clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123)
# Candidate-vacancy filter over `telegram_parser_chatmessage`:
#   * only rows whose `timestamp` is within the last 30 days;
#   * `length(message)` must exceed 150;
#   * at least 5 of the listed keywords must occur in the message;
#   * the message must contain none of the three markers at the end of the
#     WHERE clause (roughly "About me", "About myself", "#looking").
# NOTE(review): ClickHouse `length` presumably counts bytes, not characters
# (Cyrillic is 2 bytes per letter in UTF-8) — confirm the 150 threshold means
# what was intended, or consider `lengthUTF8`.
# NOTE(review): `position` is presumably case-sensitive, so capitalised
# keywords would not be counted — verify against the ClickHouse docs and
# consider `positionCaseInsensitiveUTF8` if that is unintended.
query = """
SELECT id, chat_username, telegram_id, message, timestamp
FROM telegram_parser_chatmessage
WHERE timestamp >= now() - INTERVAL 30 DAY
AND length(message) > 150
AND arrayCount(x -> position(message, x) > 0, [
'вакансия', 'ищем', 'требуется', 'разработчик', 'будет плюсом',
'зарплата', 'оклад', 'з/п', 'руб', 'опыт работы',
'требования', 'обязанности', 'условия', 'компания', 'офис',
'удаленно', 'гибкий график', 'полный день', 'частичная занятость',
'резюме', 'собеседование', 'junior', 'middle', 'senior'
]) >= 5 AND position(message, 'О себе') = 0 AND position(message, 'Обо мне') = 0 AND position(message, '#ищу') = 0
"""
class Command(BaseCommand):
    """Store vacancy-like Telegram messages as ``Vacancy`` rows and index them.

    Runs the module-level ``query`` against ClickHouse, creates a ``Vacancy``
    for every message link that is not stored yet, and adds the newly created
    vacancies to ``vector_store``.
    """

    help = "Collect vacancies from telegram messages"

    def handle(self, *args, **options):
        """Run the import.

        For each row returned by ``query`` a ``Vacancy`` is looked up by its
        message link and created when missing. Progress is written to
        ``self.stdout`` as ``"<index> <link>"`` for every row. Only vacancies
        created during this run are sent to the vector store, so repeated
        runs over the same 30-day window do not index the same message again.

        Args:
            *args: Unused positional arguments passed by Django.
            **options: Unused parsed command-line options.
        """
        documents = []
        rows = clickhouse_client.query(query).result_rows
        for index, row in enumerate(rows):
            # Row layout mirrors the SELECT list in ``query``; the ClickHouse
            # row id and the timestamp are not needed here.
            _row_id, chat_username, telegram_id, message, _timestamp = row

            # NOTE(review): "t.me/c/<id>/<msg>" is presumably Telegram's form
            # for numeric private-chat ids, while usernames use
            # "t.me/<username>/<msg>". Confirm before changing: ``link`` is
            # the lookup key for already stored vacancies.
            link = f"https://t.me/c/{chat_username}/{telegram_id}"

            vacancy, created = Vacancy.objects.get_or_create(
                link=link,
                defaults={"content": message},
            )
            self.stdout.write(f"{index} {link}")

            # An existing row means the message was handled by an earlier
            # run; adding it again would duplicate it in the vector store.
            if not created:
                continue

            metadata = {"link": link, "vacancy_id": vacancy.id}
            documents.append(Document(page_content=message, metadata=metadata))

        # Skip the vector-store call entirely when nothing new was found.
        if documents:
            vector_store.add_documents(documents)