2023年11月21日火曜日

ウェブスクレイピング (selenium - requests)

# ウェブスクレイピング
# python + selenium + webDriver + BeautifulSoup
#
#python 3.8+
#
#conda install -c conda-forge selenium==4.15.1 
#   → https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
#conda install -c anaconda beautifulsoup4==4.9.1

import sys
import time
import datetime
import traceback
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests as rq
import ssl, urllib3

##################################################
class CustomHttpAdapter(rq.adapters.HTTPAdapter):
  """requests transport adapter whose urllib3 pool uses a caller-supplied SSL context.

  Mount an instance on a ``requests.Session`` (e.g. for the ``https://``
  prefix) so that connections opened through it are created with
  ``ssl_context`` instead of the library default.
  """

  def __init__(self, ssl_context=None, **kwargs):
    """Remember ``ssl_context`` and pass the remaining keyword arguments to HTTPAdapter.

    Args:
      ssl_context: ``ssl.SSLContext`` handed to the pool manager, or None.
      **kwargs: forwarded unchanged to ``HTTPAdapter.__init__``.
    """
    # Keep this assignment ahead of super().__init__(): init_poolmanager()
    # reads self.ssl_context, and the base constructor is expected to invoke
    # it while initialising (per requests' implementation - confirm on upgrade).
    self.ssl_context = ssl_context
    super().__init__(**kwargs)

  def init_poolmanager(self, connections, maxsize, block=False, **pool_kwargs):
    """Create ``self.poolmanager`` with the stored SSL context.

    Args:
      connections: number of connection pools to cache (``num_pools``).
      maxsize: maximum number of connections kept per pool.
      block: whether the pool blocks when no connection is free.
      **pool_kwargs: extra options forwarded to ``urllib3`` ``PoolManager``,
        matching the signature of the base-class method.
    """
    # The adapter's own context always wins over one passed via pool_kwargs,
    # and assigning it here avoids a duplicate-keyword TypeError.
    pool_kwargs["ssl_context"] = self.ssl_context
    self.poolmanager = urllib3.poolmanager.PoolManager(
        num_pools=connections,
        maxsize=maxsize,
        block=block,
        **pool_kwargs)

##################################################
def login_post():
  """Log in with headless Edge, then reuse the browser cookies for a requests POST.

  Steps:
    1. Start a headless Edge driver using ./msedgedriver.exe.
    2. Open the login page, fill the 'user-name' and 'password' fields and
       click the 'login' element.
    3. Copy every browser cookie into a requests Session whose https://
       adapter is a CustomHttpAdapter with a custom SSL context, POST to the
       target URL and print the response body.

  The browser is shut down even when one of the steps raises.

  Returns:
    None. Progress messages and the response text are printed to stdout.
  """
  # open web browser
  print('open web browser')
  options = webdriver.EdgeOptions()
  options.add_argument("headless")
  options.add_argument('log-level=3') # INFO = 0, WARNING = 1, LOG_ERROR = 2, LOG_FATAL = 3.
  service = webdriver.EdgeService(executable_path='./msedgedriver.exe', service_args=['--log-level=SEVERE'])

  driver = webdriver.Edge(service=service, options=options)
  try:
    driver.set_window_size(1200, 1000)

    # login
    print ('login')
    driver.get('url')
    time.sleep(1)  # fixed pause to let the login page load
    print (driver.current_url)
    driver.find_element(By.ID, 'user-name').send_keys('id')
    driver.find_element(By.ID, 'password').send_keys('pass')
    driver.find_element(By.ID, 'login').click()
    time.sleep(2)  # fixed pause to let the login request finish

    with rq.Session() as s:
      ctx = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
      # 0x4 is OpenSSL's SSL_OP_LEGACY_SERVER_CONNECT; use the named constant
      # where the ssl module provides it (Python 3.12+), else the raw value.
      ctx.options |= getattr(ssl, 'OP_LEGACY_SERVER_CONNECT', 0x4)
      s.mount('https://', CustomHttpAdapter(ctx))
      # Carry the logged-in browser session over to requests.
      for cookie in driver.get_cookies():
        s.cookies.set(cookie["name"], cookie["value"],
                      domain=cookie["domain"], path=cookie["path"])
      # NOTE(review): `data` is not assigned anywhere in this function; it has
      # to exist at module level or this line raises NameError - confirm.
      st = s.post('url', data=data)
      print (st.text)
  finally:
    # close web browser
    # quit() ends the WebDriver session and closes its windows, so a separate
    # close() is not needed; running it in `finally` prevents a leaked
    # headless Edge process when any step above fails.
    print ('close web browser')
    driver.quit()

0 件のコメント: