Home crawling-task-8
Post
Cancel

crawling-task-8

Reddit 긁어오기

main.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import requests
from flask import Flask, render_template, request
from bs4 import BeautifulSoup

# Browser-style User-Agent header passed along with the outgoing Reddit requests.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
}

# Subreddit names handed to the home page template, which lists one checkbox per name.
subreddits = [
    "javascript", "reactjs", "reactnative",
    "programming", "css", "golang",
    "flutter", "rust", "django",
]

def get_info(data,select):
  """Extract the title, upvote text and link of one post container.

  Args:
    data: Parsed post element exposing ``find(tag, attrs)``; the caller in
      this file passes BeautifulSoup tags.
    select: Subreddit selection the post was fetched for; copied into the
      result unchanged.

  Returns:
    A dict with the keys "title", "upvotes", "url" and "select", or None
    when the score element, the title element, the link element or the
    link's href is missing. "upvotes" is the raw displayed text.
  """
  upvotes = data.find("div", {"class": "_1rZYMD_4xY3gRcSS3p8ODO"})
  title = data.find("h3", {"class": "_eYtD2XCVieq6emjKBH3m"})
  link = data.find("a", {"class": "SQnoC3ObvgnGjWt90zD9Z"})

  # Any missing piece means this container is not a usable post.
  if upvotes is None or title is None or link is None:
    return None

  href = link.get("href")
  if href is None:
    # Without this guard the concatenation below raises TypeError.
    return None

  return {
    "title": title.get_text(),
    "upvotes": upvotes.get_text(),
    "url": "https://reddit.com" + href,
    "select": select,
  }

# Flask application object; the view functions in this file register their routes on it.
app = Flask("DayEleven")

@app.route("/")
def home():
  """Serve the landing page, handing the subreddit names to home.html."""
  page = render_template("home.html", subreddits=subreddits)
  return page

def _parse_upvotes(text):
  """Convert a displayed score such as "532" or "1.2k" to an int; 0 if it is not numeric."""
  cleaned = text.strip().lower().replace(",", "")
  multiplier = 1
  if cleaned.endswith("k"):
    cleaned = cleaned[:-1]
    multiplier = 1000
  try:
    # round() rather than int(): float error can leave e.g. 32.3 * 1000 just
    # below 32300, which int() would truncate to 32299.
    return round(float(cleaned) * multiplier)
  except (ValueError, OverflowError):
    # Non-numeric score text (for example a placeholder shown instead of a
    # number) must not crash the whole page; such posts sort last.
    return 0


@app.route("/read")
def read():
  """Fetch this month's top posts of the selected subreddits and render them.

  The selections are the names of the query parameters (each checked
  checkbox arrives as ``r/<name>=on``). Only names the home form can
  produce are fetched. Posts of all selections are merged and sorted by
  upvotes, highest first.

  Returns:
    The rendered "read.html" page, given ``lists`` (the accepted
    selections) and ``info`` (the sorted post dicts).
  """
  # The keys come straight from the URL, so restrict them to the known
  # subreddits before building a request URL from them.
  allowed = {f"r/{name}" for name in subreddits}
  selects = [key for key in request.args.to_dict() if key in allowed]

  info = []
  for select in selects:
    try:
      # A timeout keeps one stalled upstream request from hanging the page.
      res = requests.get(
        f"https://www.reddit.com/{select}/top/?t=month",
        headers=headers,
        timeout=10,
      )
      res.raise_for_status()
    except requests.RequestException:
      # Best effort: a failing subreddit is skipped, the others still render.
      continue

    datas = BeautifulSoup(res.text, "html.parser").find_all("div", {"class": "_1oQyIsiPHYt6nx7VOmd1sz"})

    for data in datas:
      post = get_info(data, select)
      # Defensive check: get_info returns None for containers that are not
      # complete posts.
      if not post:
        continue
      post["upvotes"] = _parse_upvotes(post["upvotes"])
      info.append(post)

  info_mixed = sorted(info, key=lambda post: post["upvotes"], reverse=True)

  return render_template("read.html", lists=selects, info=info_mixed)

# Start Flask's built-in server; host "0.0.0.0" makes it listen on all network interfaces.
# NOTE(review): this runs at module level, so importing this file also starts the server.
app.run(host="0.0.0.0")

templates

  • home.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
<!DOCTYPE html>
<html>

<head>
  <link href="https://andybrewer.github.io/mvp/mvp.css" rel="stylesheet">
</head>

<body>
  <header>
    <h1>Reddit Reader</h1>
    <p>A service to aggregate all your favorite subreddits</p>
  </header>
  <main>
    {# Submitted with GET to /read: every checked box becomes an "r/<name>=on" query parameter. #}
    <form action="/read">
      <h4>Select the subreddits you're interested in:</h4>
      <ul>
        {% for subreddit in subreddits %}
          <li>
            <input type="checkbox" id="{{subreddit}}" name="r/{{subreddit}}">
            <label for="{{subreddit}}">r/{{subreddit}}</label>
          </li>
        {% endfor %}
      </ul>
      <button type="submit">Aggregate</button>
    </form>
  </main>
</body>

</html>
  • read.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
<!DOCTYPE html>
<html>

<head>
  <link href="https://andybrewer.github.io/mvp/mvp.css" rel="stylesheet">
</head>

<body>
  <header>
    <h1>
      <a href="/">Reddit Reader</a>
    </h1>
    {# "lists" holds the selected subreddit names passed in by the /read view. #}
    <h3>Reading:
      {% for name in lists %}
        {{name}}
      {% endfor %}
    </h3>
  </header>
  <main>
    {# "info" is the list of post dicts (title, upvotes, url, select) passed in by the /read view. #}
    {% for i in info %}
      <div>
        <h3>
          <a href="{{i.url}}" target="_blank" rel="noopener noreferrer">{{i.title}}</a>
        </h3>
        <h4>{{i.upvotes}} upvotes · {{i.select}}</h4>
        <hr>
      </div>
    {% else %}
      <p>No posts found.</p>
    {% endfor %}
  </main>
  <footer>
    <button type="button" onclick="location.href='/'">Back to Home</button>
  </footer>
</body>

</html>

메모

  • 가져온 응답이 JSON인지 HTML인지 구분해서 BeautifulSoup 필요 여부를 판단하기

  • 어려웠던 부분

    • 가져오는 사이트마다 구조가 다르다는 점
    • None type 방어하는 방법
    • 파이썬 문법
This post is licensed under CC BY 4.0 by the author.