My Tech Life

Memo by a Japanese Software Developer in his late 50s.

Python script to check the differences between the followers and followed blogs of a user on a Japanese blog site.

This program checks the differences between the favorite blogs and reader blogs of a user on Ameblo, one of the most famous Japanese blog sites.

Set the user in the following line:

 

ameblo_user = 'XXXXXXXXXXXXXX' 

 

This script retrieves links only from the favorite blogs and reader blogs that the specified user has made public. It does not retrieve private information or require login credentials.
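
For reference, here is a small sketch of the public list-page URLs the script walks through; the pagination pattern (favorite-1.html, favorite-2.html, ...) is the one the full script below assumes.

# A sketch of the paginated list-page URLs visited by the script below.
# 'XXXXXXXXXXXXXX' is the placeholder user ID from above.
base = 'https://ameblo.jp/' + 'XXXXXXXXXXXXXX'
for kind in ('favorite', 'reader'):
  for page_no in range(1, 3):
    print('{}/{}-{}.html'.format(base, kind, page_no))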

 

To find the differences, it uses Python's set (collection) difference.
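
As a minimal illustration of the set-difference idea (the blog names here are made up):

favorites = {'blog-a', 'blog-b', 'blog-c'}
readers = {'blog-b', 'blog-c', 'blog-d'}
print(favorites - readers)   # {'blog-a'}: favorite blogs that are not reader blogs
print(readers - favorites)   # {'blog-d'}: reader blogs that are not favorites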

 

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse, urljoin
import chardet
import time
import re

def get_html_content(url):
  # Fetch a URL and return its body as text; return '' on any failure.
  html_content = ''
  try:
    print('fetching url', url)
    q = Request(url)
    html = urlopen(q, timeout=15)

    temp = html.read()
    # Guess the encoding with chardet; fall back to UTF-8 when detection fails
    # or when it reports Windows-1254 (a common misdetection for UTF-8 pages).
    detect_enc = chardet.detect(temp)['encoding']
    if detect_enc is None:
      detect_enc = 'utf-8'
    elif detect_enc == 'Windows-1254':
      detect_enc = 'utf-8'
    html_content = temp.decode(detect_enc, errors='ignore')

  except Exception as e:
    print('fetching url failed', url, repr(e))

  return html_content

def get_site_links(site_url):
  # Walk the paginated list pages (xxx-1.html, xxx-2.html, ...) and collect
  # [absolute_link, link_text] pairs.  Relies on the globals base_url and my_site.
  total_links = [ ]
  links_backup = [ ]
  for page_no in range(1, 1000):
    site = site_url.replace('.html', '-' + str(page_no) + '.html')
    print(site)
    html_content = get_html_content(site)
    if not html_content:
      break
    else:
      soup = BeautifulSoup(html_content, 'html.parser')

      links = [ ]
      for link in soup.find_all('a'):
        href = link.get('href')
        text = link.get_text()
        if href:
          absolute_link = urljoin(base_url, href)
          if my_site in absolute_link:
            continue   # skip links back to the user's own pages
          if base_url not in absolute_link:
            continue   # keep only links within ameblo.jp
          text = re.sub('更新$', '', text)   # strip the trailing "updated" label
          links.append([absolute_link, text])

      # Stop when a page repeats the previous one (we are past the last page).
      if links_backup == links:
        return total_links

      links_backup = links
      total_links.extend(links)

    time.sleep(1)   # be polite to the server

  # Also return what was collected if the loop ends via break or runs out.
  return total_links

# ----- Main -----
ameblo_user = 'XXXXXXXXXXXXXX'

ameblo_site = 'https://ameblo.jp/'
my_site = '{}{}'.format(ameblo_site, ameblo_user)

parsed_url = urlparse(my_site)
base_url = '{}://{}'.format(parsed_url.scheme, parsed_url.netloc)

# Collect blog URLs and blog names from the favorite list and the reader list.
total_names = [ ]
total_links = [ ]
target_pages = ['/favorite.html', '/reader.html']
for page in target_pages:
  site_url = my_site + page
  rets = get_site_links(site_url)

  links = [ret[0] for ret in rets]
  total_links.append(links)

  names = [ret[1] for ret in rets]
  total_names.append(names)

# Set difference in both directions:
# (0, 1) = favorite blogs not in the reader list, (1, 0) = reader blogs not in the favorites.
subs = [(0, 1), (1, 0)]
for sub in subs:
  sub0 = sub[0]
  sub1 = sub[1]
  diff_list = set(total_names[sub0]) - set(total_names[sub1])
  print(len(diff_list))
  for elem in diff_list:
    print(elem)

 

 

 


How to group emails by week into arrays in Python

When conducting data analysis, deciding on the unit for aggregating the data is crucial.

I am currently analyzing data obtained from bulk email collection, and it seems beneficial to aggregate the data on a weekly basis.

 

Here's a sample scenario:

 

The input data is prepared as a two-dimensional array: the first element of each row is the date, and the second element is the sample text.

 

As preparation (a short parsing sketch follows this list):

  • Format the dates and convert them into an array.
  • Separate the texts into their own array.
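
Here is a minimal sketch of the parsing step, using the same format string as the full script below:

from datetime import datetime

# One of the sample date strings from the data above.
raw = "Thu, 18 May 2017 02:24:34 +0000"

# %z makes the resulting datetime timezone-aware.
parsed = datetime.strptime(raw, "%a, %d %b %Y %H:%M:%S %z")
print(parsed)   # 2017-05-18 02:24:34+00:00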

 

Then, for each date, determine the ISO week number within the year, build a key from the year, the month, and that week number, and use it as a dictionary (hash) key.

Each dictionary entry has sub-keys for the dates and the texts, each holding an array.
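
A quick sketch of how the key is derived; isocalendar() returns the ISO year, the ISO week number, and the ISO weekday:

from datetime import datetime

d = datetime(2017, 5, 18)
# On Python 3.9+ isocalendar() returns a named tuple, but indexing works the same.
print(tuple(d.isocalendar()))            # (2017, 20, 4)
key = (d.year, d.month, d.isocalendar()[1])
print(key)                               # (2017, 5, 20) -- the grouping key

Note that because the calendar month is part of the key, a week that straddles a month boundary is split into two groups; dropping the month from the key would give strict ISO weeks.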

 

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
from datetime import datetime

data = [
    ["Thu, 18 May 2017 02:24:34 +0000","This is mail 1"],
    ["Thu, 08 Jun 2017 23:13:22 +0000","This is mail 2"],
    ["Mon, 25 Dec 2017 03:12:33 +0000","This is mail 3"],
    ["Wed, 13 Sep 2017 22:46:44 -0700","This is mail 4"],
    ["Wed, 13 Sep 2017 20:11:18 -0700","This is mail 5"],
    ["Wed, 13 Sep 2017 10:19:19 -0700","This is mail 6"],
    ["Tue, 12 Sep 2017 22:59:06 -0700","This is mail 7"]
]

# Parse the email-style (RFC 2822) date strings into timezone-aware datetime objects.
datetime_dates = [datetime.strptime(datum[0], "%a, %d %b %Y %H:%M:%S %z") for datum in data]
texts = [datum[1] for datum in data]

# Group the mails by (year, month, ISO week number).
week_groups = { }
for (date, text) in zip(datetime_dates, texts):
    date_key = (date.year, date.month, date.isocalendar()[1])
    if date_key not in week_groups:
        week_groups[date_key] = { }
        week_groups[date_key]['dates'] = [ ]
        week_groups[date_key]['texts'] = [ ]
    week_groups[date_key]['dates'].append(date)
    week_groups[date_key]['texts'].append(text)

for key in sorted(week_groups.keys()):
    print(key)
    dates = week_groups[key]['dates']
    texts = week_groups[key]['texts']
    for (date, text) in zip(dates, texts):
        print(date, text)

 

Here's the result.

(base) C:\pytest>python test6.py
(2017, 5, 20)
2017-05-18 02:24:34+00:00 This is mail 1
(2017, 6, 23)
2017-06-08 23:13:22+00:00 This is mail 2
(2017, 9, 37)
2017-09-13 22:46:44-07:00 This is mail 4
2017-09-13 20:11:18-07:00 This is mail 5
2017-09-13 10:19:19-07:00 This is mail 6
2017-09-12 22:59:06-07:00 This is mail 7
(2017, 12, 52)
2017-12-25 03:12:33+00:00 This is mail 3

 

