My Tech Life

Memo by a Japanese Software Developer in his late 50s.

Python script to check the differences between the followers and followed blogs of a user on a Japanese blog site.

This program checks

the differences between the favorite blogs and reader blogs

of a user of Ameblo, one of the most famous Japanese blog sites.

Set the user in the following line:

 

ameblo_user = 'XXXXXXXXXXXXXX' 

 

This script retrieves

links only from the favorite blogs and reader blogs

that the specified user has made public.

It does not retrieve private information or require login credentials.

 

To check for differences, it uses Python set (collection) difference operations.

 

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse, urljoin
import chardet
import time
import re

def get_html_content(url):
  """Fetch *url* and return the response body decoded to str.

  The encoding is auto-detected with chardet.  On any fetch or decode
  failure the error is printed and '' is returned; callers treat an
  empty string as "page does not exist", which stops pagination.
  """
  html_content = ''
  try:
    print('fetching url', url)
    req = Request(url)
    # Context manager guarantees the HTTP response is closed even if
    # read() raises (the original leaked the connection).
    with urlopen(req, timeout=15) as resp:
      raw = resp.read()

    detect_enc = chardet.detect(raw)['encoding']
    # chardet sometimes returns None, and misdetects some UTF-8 Ameblo
    # pages as Windows-1254; fall back to UTF-8 in both cases.
    if detect_enc is None or detect_enc == 'Windows-1254':
      detect_enc = 'utf-8'
    html_content = raw.decode(detect_enc, errors='ignore')

  except Exception as e:
    # Best-effort by design: report the failure and return '' so the
    # caller's pagination loop can terminate cleanly.
    print('fetching url failed', url, repr(e))

  return html_content

def get_site_links(site_url):
  """Collect [absolute_url, link_text] pairs from every page of a
  paginated Ameblo list page (favorite.html / reader.html).

  Page N is fetched as '...-N.html'.  Pagination stops when a page is
  empty (fetch failed) or repeats the previous page — Ameblo serves the
  last real page again for out-of-range page numbers.

  NOTE: relies on the module-level globals ``base_url`` (to resolve and
  filter links) and ``my_site`` (to skip links back to the user's own
  blog), which are defined in the main section below.

  Returns a list of [url, name] pairs (always a list, never None).
  """
  total_links = [ ]
  prev_links = [ ]
  for page_no in range(1, 1000):
    page_url = site_url.replace('.html', '-' + str(page_no) + '.html')
    print(page_url)
    html_content = get_html_content(page_url)
    if not html_content:
      break

    soup = BeautifulSoup(html_content, 'html.parser')
    links = [ ]
    for anchor in soup.find_all('a'):
      href = anchor.get('href')
      text = anchor.get_text()
      if not href:
        continue
      absolute_link = urljoin(base_url, href)
      # Skip links to the user's own blog and links off the ameblo domain.
      if my_site in absolute_link:
        continue
      if base_url not in absolute_link:
        continue
      # Strip the trailing "updated" badge Ameblo appends to link text.
      text = re.sub('更新$', '', text)
      links.append([absolute_link, text])

    # An out-of-range page number returns the last page again: done.
    if links == prev_links:
      return total_links
    prev_links = links

    total_links.extend(links)

    # Be polite to the server between page fetches.
    time.sleep(1)

  # BUG FIX: the original fell off the end here and implicitly returned
  # None (e.g. when the first fetch failed), crashing the caller which
  # iterates the result.  Always return the accumulated list.
  return total_links

# ----- Main -----
# Set the Ameblo user whose public lists should be compared.
ameblo_user = 'XXXXXXXXXXXXXX' 

ameblo_site = 'https://ameblo.jp/'
my_site = ameblo_site + ameblo_user

# Derive 'scheme://host' once; get_site_links() reads these globals.
parsed_url = urlparse(my_site)
base_url = '{}://{}'.format(parsed_url.scheme, parsed_url.netloc)

# Index 0 holds the favorite-blogs list, index 1 the reader-blogs list.
total_names = [ ]
total_links = [ ]
for page in ['/favorite.html', '/reader.html']:
  rets = get_site_links(my_site + page)
  total_links.append([ret[0] for ret in rets])
  total_names.append([ret[1] for ret in rets])

# Print both set differences: first favorites-minus-readers,
# then readers-minus-favorites (count followed by the names).
for left, right in ((0, 1), (1, 0)):
  diff_list = set(total_names[left]) - set(total_names[right])
  print(len(diff_list))
  for elem in diff_list:
    print(elem)