Integrate article parser
This commit is contained in:
@ -1,7 +1,5 @@
|
|||||||
import requests
|
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from django.conf import settings
|
from newspaper import Article, Config
|
||||||
|
|
||||||
|
|
||||||
def sanitize_img_size(html: str):
|
def sanitize_img_size(html: str):
|
||||||
@ -13,10 +11,10 @@ def sanitize_img_size(html: str):
|
|||||||
|
|
||||||
|
|
||||||
def get_content(url: str):
|
def get_content(url: str):
|
||||||
if settings.READABILITY_HOST:
|
config = Config()
|
||||||
url = settings.READABILITY_HOST.rstrip(
|
config.keep_article_html = True
|
||||||
'/') + '/api/content/v1/parser?url=' + url
|
article = Article(url, config=config)
|
||||||
try:
|
article.download()
|
||||||
return requests.get(url).json()
|
article.parse()
|
||||||
except KeyError:
|
|
||||||
return None
|
return article
|
||||||
|
18
later42/migrations/0007_article_img.py
Normal file
18
later42/migrations/0007_article_img.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
# Generated by Django 4.1.3 on 2022-11-30 06:10
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('later42', '0006_remove_url_content_remove_url_title_article_short_and_more'),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='article',
|
||||||
|
name='img',
|
||||||
|
field=models.URLField(blank=True, null=True),
|
||||||
|
),
|
||||||
|
]
|
@ -8,3 +8,4 @@ class Article(models.Model):
|
|||||||
content = models.TextField(blank=True, null=True)
|
content = models.TextField(blank=True, null=True)
|
||||||
title = models.CharField(max_length=2000, blank=True, null=True)
|
title = models.CharField(max_length=2000, blank=True, null=True)
|
||||||
short = models.TextField(blank=True, null=True)
|
short = models.TextField(blank=True, null=True)
|
||||||
|
img = models.URLField(blank=True, null=True)
|
||||||
|
@ -21,8 +21,6 @@ if AIRBRAKE_PROJECT_ID is not None and AIRBRAKE_PROJECT_KEY is not None:
|
|||||||
|
|
||||||
@shared_task()
|
@shared_task()
|
||||||
def get_url_content_task(url, user_id):
|
def get_url_content_task(url, user_id):
|
||||||
print(url)
|
|
||||||
print(user_id)
|
|
||||||
user = User.objects.get(pk=int(user_id))
|
user = User.objects.get(pk=int(user_id))
|
||||||
url_object = URL(url=url, user=user)
|
url_object = URL(url=url, user=user)
|
||||||
url_object.save()
|
url_object.save()
|
||||||
@ -30,7 +28,10 @@ def get_url_content_task(url, user_id):
|
|||||||
data = get_content(url)
|
data = get_content(url)
|
||||||
|
|
||||||
article = Article.objects.create(url=url_object)
|
article = Article.objects.create(url=url_object)
|
||||||
article.content = data['rich_content']
|
article.content = data.article_html
|
||||||
article.title = data['title']
|
article.title = data.title
|
||||||
article.short = data['excerpt']
|
article.short = data.text[:150]
|
||||||
|
if data.has_top_image():
|
||||||
|
article.img = data.top_image
|
||||||
|
|
||||||
article.save()
|
article.save()
|
||||||
|
@ -15,6 +15,13 @@
|
|||||||
<a href="{% url 'archive_url' d.url_id %}"><span class="fa-regular fa-square-check"></span></a>
|
<a href="{% url 'archive_url' d.url_id %}"><span class="fa-regular fa-square-check"></span></a>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
{% if d.img %}
|
||||||
|
<p class="post-preview">
|
||||||
|
<a href="{% url 'reader' d.url_id %}">
|
||||||
|
<img class="img-fluid" src="{{ d.img }}" width="100%" height="100%" />
|
||||||
|
</a>
|
||||||
|
</p>
|
||||||
|
{% endif %}
|
||||||
{% if d.short %}
|
{% if d.short %}
|
||||||
<p class="post-meta">
|
<p class="post-meta">
|
||||||
{{ d.short }}
|
{{ d.short }}
|
||||||
|
@ -1,8 +1,5 @@
|
|||||||
from multiprocessing import context
|
|
||||||
from django.contrib.auth.decorators import login_required
|
from django.contrib.auth.decorators import login_required
|
||||||
from django.shortcuts import render, redirect
|
from django.shortcuts import render
|
||||||
from django.core.paginator import Paginator
|
|
||||||
from django.conf import settings
|
|
||||||
from later42.libs.content import get_content, sanitize_img_size
|
from later42.libs.content import get_content, sanitize_img_size
|
||||||
from later42.models.article import Article
|
from later42.models.article import Article
|
||||||
from later42.models.urls import URL
|
from later42.models.urls import URL
|
||||||
@ -15,11 +12,11 @@ def get(request, url_id=None):
|
|||||||
content = {}
|
content = {}
|
||||||
try:
|
try:
|
||||||
article = Article.objects.get(url=url)
|
article = Article.objects.get(url=url)
|
||||||
content['title'] = url.title
|
content['title'] = article.title
|
||||||
content['url'] = url.url
|
content['url'] = url.url
|
||||||
content['rich_content'] = sanitize_img_size(article.content)
|
content['rich_content'] = sanitize_img_size(article.content)
|
||||||
except:
|
except:
|
||||||
content = get_content(url.url)
|
content = get_content(url.url)
|
||||||
content['rich_content'] = sanitize_img_size(content['rich_content'])
|
content['rich_content'] = sanitize_img_size(content.article_html)
|
||||||
context = {'url': url, 'content': content}
|
context = {'url': url, 'content': content}
|
||||||
return render(request, 'reader.html', context)
|
return render(request, 'reader.html', context)
|
||||||
|
@ -10,3 +10,4 @@ six==1.16.0
|
|||||||
celery[redis]==5.2.7
|
celery[redis]==5.2.7
|
||||||
pybrake==1.10.0
|
pybrake==1.10.0
|
||||||
sentry-sdk==1.11.0
|
sentry-sdk==1.11.0
|
||||||
|
newspaper3k==0.2.8
|
||||||
|
Reference in New Issue
Block a user