Integrate article parser

This commit is contained in:
2022-11-30 09:20:35 +03:00
parent 8cee3406ea
commit b8eb2b9bf5
7 changed files with 44 additions and 21 deletions

View File

@ -1,7 +1,5 @@
import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from django.conf import settings from newspaper import Article, Config
def sanitize_img_size(html: str): def sanitize_img_size(html: str):
@ -13,10 +11,10 @@ def sanitize_img_size(html: str):
def get_content(url: str): def get_content(url: str):
if settings.READABILITY_HOST: config = Config()
url = settings.READABILITY_HOST.rstrip( config.keep_article_html = True
'/') + '/api/content/v1/parser?url=' + url article = Article(url, config=config)
try: article.download()
return requests.get(url).json() article.parse()
except KeyError:
return None return article

View File

@ -0,0 +1,18 @@
# Generated by Django 4.1.3 on 2022-11-30 06:10
from django.db import migrations, models
class Migration(migrations.Migration):
    """Schema migration that adds an ``img`` URL field to the ``article`` model."""

    # This migration is ordered after migration 0006 of the ``later42`` app.
    dependencies = [
        ('later42', '0006_remove_url_content_remove_url_title_article_short_and_more'),
    ]

    # One operation: add the ``img`` URLField. It is declared with
    # blank=True and null=True, so the field is optional.
    operations = [
        migrations.AddField(
            model_name='article',
            name='img',
            field=models.URLField(blank=True, null=True),
        ),
    ]

View File

@ -8,3 +8,4 @@ class Article(models.Model):
content = models.TextField(blank=True, null=True) content = models.TextField(blank=True, null=True)
title = models.CharField(max_length=2000, blank=True, null=True) title = models.CharField(max_length=2000, blank=True, null=True)
short = models.TextField(blank=True, null=True) short = models.TextField(blank=True, null=True)
img = models.URLField(blank=True, null=True)

View File

@ -21,8 +21,6 @@ if AIRBRAKE_PROJECT_ID is not None and AIRBRAKE_PROJECT_KEY is not None:
@shared_task() @shared_task()
def get_url_content_task(url, user_id): def get_url_content_task(url, user_id):
print(url)
print(user_id)
user = User.objects.get(pk=int(user_id)) user = User.objects.get(pk=int(user_id))
url_object = URL(url=url, user=user) url_object = URL(url=url, user=user)
url_object.save() url_object.save()
@ -30,7 +28,10 @@ def get_url_content_task(url, user_id):
data = get_content(url) data = get_content(url)
article = Article.objects.create(url=url_object) article = Article.objects.create(url=url_object)
article.content = data['rich_content'] article.content = data.article_html
article.title = data['title'] article.title = data.title
article.short = data['excerpt'] article.short = data.text[:150]
if data.has_top_image():
article.img = data.top_image
article.save() article.save()

View File

@ -15,6 +15,13 @@
<a href="{% url 'archive_url' d.url_id %}"><span class="fa-regular fa-square-check"></span></a> <a href="{% url 'archive_url' d.url_id %}"><span class="fa-regular fa-square-check"></span></a>
</div> </div>
</div> </div>
{% if d.img %}
<p class="post-preview">
<a href="{% url 'reader' d.url_id %}">
<img class="img-fluid" src="{{ d.img }}" width="100%" height="100%" />
</a>
</p>
{% endif %}
{% if d.short %} {% if d.short %}
<p class="post-meta"> <p class="post-meta">
{{ d.short }} {{ d.short }}

View File

@ -1,8 +1,5 @@
from multiprocessing import context
from django.contrib.auth.decorators import login_required from django.contrib.auth.decorators import login_required
from django.shortcuts import render, redirect from django.shortcuts import render
from django.core.paginator import Paginator
from django.conf import settings
from later42.libs.content import get_content, sanitize_img_size from later42.libs.content import get_content, sanitize_img_size
from later42.models.article import Article from later42.models.article import Article
from later42.models.urls import URL from later42.models.urls import URL
@ -15,11 +12,11 @@ def get(request, url_id=None):
content = {} content = {}
try: try:
article = Article.objects.get(url=url) article = Article.objects.get(url=url)
content['title'] = url.title content['title'] = article.title
content['url'] = url.url content['url'] = url.url
content['rich_content'] = sanitize_img_size(article.content) content['rich_content'] = sanitize_img_size(article.content)
except: except:
content = get_content(url.url) content = get_content(url.url)
content['rich_content'] = sanitize_img_size(content['rich_content']) content['rich_content'] = sanitize_img_size(content.article_html)
context = {'url': url, 'content': content} context = {'url': url, 'content': content}
return render(request, 'reader.html', context) return render(request, 'reader.html', context)

View File

@ -10,3 +10,4 @@ six==1.16.0
celery[redis]==5.2.7 celery[redis]==5.2.7
pybrake==1.10.0 pybrake==1.10.0
sentry-sdk==1.11.0 sentry-sdk==1.11.0
newspaper3k==0.2.8