crawling-task-8

Reddit 긁어오기

main.py

import requests
from flask import Flask, render_template, request
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every outgoing request to reddit.com.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}

# Subreddit names offered as checkboxes on the home page (without the "r/" prefix).
subreddits = [
    "javascript",
    "reactjs",
    "reactnative",
    "programming",
    "css",
    "golang",
    "flutter",
    "rust",
    "django",
]

def get_info(data, select):
  """Extract title, upvote label and link from one parsed post container.

  Args:
    data: Parsed post element exposing ``find(name, attrs)``; the caller in
      this file passes elements returned by BeautifulSoup's ``find_all``.
    select: Subreddit path the post was fetched from; copied into the result
      unchanged.

  Returns:
    A dict with the keys "title", "upvotes" (the raw label text), "url" and
    "select", or None when the upvote counter, the title, the link element
    or the link's href is missing.
  """
  upvotes = data.find("div", {"class": "_1rZYMD_4xY3gRcSS3p8ODO"})
  title = data.find("h3", {"class": "_eYtD2XCVieq6emjKBH3m"})
  link = data.find("a", {"class": "SQnoC3ObvgnGjWt90zD9Z"})

  # Guard clauses: a post is only usable when all three pieces are present.
  if upvotes is None or title is None or link is None:
    return None

  # An anchor without href would make the string concatenation below fail.
  href = link.get("href")
  if href is None:
    return None

  return {
    "title": title.get_text(),
    "upvotes": upvotes.get_text(),
    "url": "https://reddit.com" + href,
    "select": select,
  }

# Flask application object; the @app.route decorators below register their
# view functions on it.
app = Flask("DayEleven")

@app.route("/")
def home():
  """Serve the landing page with one checkbox per known subreddit."""
  page = render_template("home.html", subreddits=subreddits)
  return page

def _parse_upvotes(text):
  """Convert a score label such as "987", "1.2k" or "3m" into an int.

  Labels that are not numeric (for example a textual placeholder shown
  instead of a score) count as 0, so a single odd post cannot fail the
  whole page.
  """
  label = text.strip().lower().replace(",", "")
  multiplier = 1
  if label.endswith("k"):
    multiplier, label = 1000, label[:-1]
  elif label.endswith("m"):
    multiplier, label = 1000000, label[:-1]
  try:
    return int(float(label) * multiplier)
  except (ValueError, OverflowError):
    return 0


@app.route("/read")
def read():
  """Aggregate this month's top posts of the selected subreddits.

  The checkboxes of the home form arrive as query-string keys of the form
  "r/<subreddit>" (the submitted value is just "on"). Every selected
  subreddit is fetched and parsed, and the posts of all of them are rendered
  together, sorted by upvotes in descending order.

  Exceptions raised by ``requests.get`` (including the timeout) propagate to
  Flask unchanged.
  """
  # The keys come straight from the query string, so only names offered by
  # the form are allowed into the request URL.
  allowed = {f"r/{name}" for name in subreddits}
  selects = [key for key in request.args.to_dict() if key in allowed]

  info = []
  for select in selects:
    # Without a timeout a stalled connection would block this view forever.
    res = requests.get(
      f"https://www.reddit.com/{select}/top/?t=month",
      headers=headers,
      timeout=10,
    )
    posts = BeautifulSoup(res.text, "html.parser").find_all(
      "div", {"class": "_1oQyIsiPHYt6nx7VOmd1sz"}
    )

    for post in posts:
      item = get_info(post, select)
      # Defensive: get_info returns None for containers it cannot parse.
      if item is None:
        continue
      # Normalise the label once, at collection time, so sorting compares ints.
      item["upvotes"] = _parse_upvotes(item["upvotes"])
      info.append(item)

  info_mixed = sorted(info, key=lambda post: post["upvotes"], reverse=True)

  return render_template("read.html", lists=selects, info=info_mixed)

# Start the development server only when this file is executed directly, so
# that merely importing the module does not start a blocking server.
# host="0.0.0.0" makes the server listen on every network interface.
if __name__ == "__main__":
  app.run(host="0.0.0.0")

templates

home.html

<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="utf-8">
  <title>Reddit Reader</title>
  <link href="https://andybrewer.github.io/mvp/mvp.css" rel="stylesheet">
</head>

<body>
  <header>
    <h1>Reddit Reader</h1>
    <p>A service to aggregate all your favorite subreddits</p>
  </header>
  <main>
    {# Each checked box is submitted to /read as the query key "r/<subreddit>". #}
    <form action="/read">
      <h4>Select the subreddits you're interested in:</h4>
      <ul>
        {% for subreddit in subreddits %}
          <li>
            <input type="checkbox" id="{{subreddit}}" name="r/{{subreddit}}">
            <label for="{{subreddit}}">r/{{subreddit}}</label>
          </li>
        {% endfor %}
      </ul>
      <button type="submit">Aggregate</button>
    </form>
  </main>
</body>

</html>

read.html

<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="utf-8">
  <title>Reddit Reader</title>
  <link href="https://andybrewer.github.io/mvp/mvp.css" rel="stylesheet">
</head>

<body>
  <header>
    <h1>
      <a href="/">Reddit Reader</a>
    </h1>
    {# "lists" holds the selected subreddit paths passed in by the /read view. #}
    <h3>Reading:
      {% for list in lists %}
        {{list}}
      {% endfor %}
    </h3>
  </header>
  <main>
    {# "info" is the list of post dicts (title, upvotes, url, select) from the view. #}
    {% for i in info %}
      <div>
        <h3>
          {# rel="noopener noreferrer" keeps the opened page from reaching window.opener. #}
          <a href="{{i.url}}" target="_blank" rel="noopener noreferrer">{{i.title}}</a>
        </h3>
        <h4>{{i.upvotes}} upvotes · {{i.select}}</h4>
        <hr>
      </div>
    {% endfor %}
  </main>
  <footer>
    <button type="button" onclick="location.href='/'">Back to Home</button>
  </footer>
</body>

</html>

메모

가져온 내용이 JSON인지 HTML인지 구분해서 BeautifulSoup이 필요한지 판단한다.
어려웠던 부분
- 가져오는 사이트마다 구조가 다르다는 점
- None type 방어하는 방법
- 파이썬 문법

crawling-task-8

reddit 긁어 오기

main.py

templates

메모

Further Reading

crawling-task-7

crawling-task-1

crawling-task-2

Trending Tags