Optimize regexes etc.; rework the logic so results don't get overwritten

to learn groupby

learn pandas groupby

groupby

learn pandas groupby

Improve the regexes that extract the code number and episode

TODO: understand the image-download logic

Also the cropping + background-image logic

Change all config[

Will reorganize the code that generates nfo

Code metadata, thumbnails and posters can now be cached

Can recognize the episode number behind the code and at the end of the name; for now it cannot tell a -C (Chinese subtitle) release apart

Fix a mistake

Store data in nested dicts

Tidy up functions

Fix the date-matching regex

pipenv: add dependencies

Change the rule to prefer three-digit serials; heyzo's four digits are the exception

Added dependencies, and improvements for files that have a code number

Can't remember what else I changed
This commit is contained in:
Tan Peng 2020-03-25 01:45:55 +08:00 committed by leo
parent 32a19bb989
commit fc13f88731
54 changed files with 3138 additions and 1951 deletions
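
The messages above keep returning to the code/episode extraction regexes. As a reading aid, here is a minimal sketch of the "prefer three-digit serials, heyzo keeps four" normalization rule; it mirrors the re.search logic in PathNameProcessor.extract_code further down this diff, but normalize_code and the sample codes are illustrative, not part of the commit:

import re

def normalize_code(name: str) -> str:
    """Minimal sketch: trim leading zeros down to a three-digit serial, except heyzo."""
    if "heyzo" in name.lower():
        return name  # heyzo codes keep their four-digit serial
    # e.g. SIVR-00008 -> SIVR-008, 107NTTR-037 -> NTTR-037
    searched = re.search(r'([a-zA-Z]{2,})-0*(\d{3,})', name)
    return '-'.join(searched.groups()) if searched else name

# hypothetical examples
assert normalize_code('SIVR-00008') == 'SIVR-008'
assert normalize_code('107NTTR-037') == 'NTTR-037'
assert normalize_code('MCDV-47') == 'MCDV-47'  # two-digit serials stay untouched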

1
.gitignore vendored

@ -1,3 +1,4 @@
*.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

2
.idea/.gitignore generated vendored Normal file

@ -0,0 +1,2 @@
# Default ignored files
/workspace.xml

8
.idea/AV_Data_Capture.iml generated Normal file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.8 (AV_Data_Capture)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

19
.idea/dictionaries/tanpengsccd.xml generated Normal file

@ -0,0 +1,19 @@
<component name="ProjectDictionaryState">
<dictionary name="tanpengsccd">
<words>
<w>avsox</w>
<w>emby</w>
<w>fanart</w>
<w>fanza</w>
<w>javbus</w>
<w>javdb</w>
<w>jellyfin</w>
<w>khtml</w>
<w>kodi</w>
<w>mgstage</w>
<w>plex</w>
<w>pondo</w>
<w>rmvb</w>
</words>
</dictionary>
</component>

.idea/inspectionProfiles/profiles_settings.xml generated Normal file

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

7
.idea/misc.xml generated Normal file

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (AV_Data_Capture)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml generated Normal file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/AV_Data_Capture.iml" filepath="$PROJECT_DIR$/.idea/AV_Data_Capture.iml" />
</modules>
</component>
</project>

6
.idea/other.xml generated Normal file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PySciProjectComponent">
<option name="PY_SCI_VIEW_SUGGESTED" value="true" />
</component>
</project>

6
.idea/vcs.xml generated Normal file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

ADC_function.py

@ -1,136 +1,127 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from configparser import ConfigParser
import os
import re
import time
import sys
from lxml import etree
import sys
import io
from ConfigApp import ConfigApp
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
# sys.setdefaultencoding('utf-8')

# config_file='config.ini'
# config = ConfigParser()
#
# if os.path.exists(config_file):
#     try:
#         config.read(config_file, encoding='UTF-8')
#     except:
#         print('[-]Config.ini read failed! Please use the offical file!')
# else:
#     print('[+]config.ini: not found, creating...',end='')
#     with open("config.ini", "wt", encoding='UTF-8') as code:
#         print("[common]", file=code)
#         print("main_mode = 1", file=code)
#         print("failed_output_folder = failed", file=code)
#         print("success_output_folder = JAV_output", file=code)
#         print("", file=code)
#         print("[proxy]",file=code)
#         print("proxy=127.0.0.1:1081",file=code)
#         print("timeout=10", file=code)
#         print("retry=3", file=code)
#         print("", file=code)
#         print("[Name_Rule]", file=code)
#         print("location_rule=actor+'/'+number",file=code)
#         print("naming_rule=number+'-'+title",file=code)
#         print("", file=code)
#         print("[update]",file=code)
#         print("update_check=1",file=code)
#         print("", file=code)
#         print("[media]", file=code)
#         print("media_warehouse=emby", file=code)
#         print("#emby plex kodi", file=code)
#         print("", file=code)
#         print("[escape]", file=code)
#         print("literals=\\", file=code)
#         print("", file=code)
#         print("[movie_location]", file=code)
#         print("path=", file=code)
#         print("", file=code)
#     print('.',end='')
#     time.sleep(2)
#     print('.')
#     print('[+]config.ini: created!')
#     print('[+]Please restart the program!')
#     time.sleep(4)
#     os._exit(0)
# try:
#     config.read(config_file, encoding='UTF-8')
# except:
#     print('[-]Config.ini read failed! Please use the offical file!')

config = ConfigApp()


def get_network_settings():
    try:
        proxy = config.proxy
        timeout = int(config.timeout)
        retry_count = int(config.retry)
        assert timeout > 0
        assert retry_count > 0
    except:
        raise ValueError("[-]Proxy config error! Please check the config.")
    return proxy, timeout, retry_count


def getDataState(json_data):  # detect failed metadata fetches
    if json_data['title'] == '' or json_data['title'] == 'None' or json_data['title'] == 'null':
        return 0
    else:
        return 1


def ReadMediaWarehouse():
    return config.media_server


def UpdateCheckSwitch():
    check = str(config.update_check)
    if check == '1':
        return '1'
    elif check == '0':
        return '0'
    elif check == '':
        return '0'


def getXpathSingle(htmlcode, xpath):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result1 = str(html.xpath(xpath)).strip(" ['']")
    return result1


def get_html(url, cookies=None):  # core web-request function
    proxy, timeout, retry_count = get_network_settings()
    i = 0
    print(url)
    while i < retry_count:
        try:
            if not proxy == '':
                proxies = {"http": proxy, "https": proxy}
                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36'}
                getweb = requests.get(str(url), headers=headers, timeout=timeout, proxies=proxies, cookies=cookies)
                getweb.encoding = 'utf-8'
                return getweb.text
            else:
                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
                getweb = requests.get(str(url), headers=headers, timeout=timeout, cookies=cookies)
                getweb.encoding = 'utf-8'
                return getweb.text
        except Exception as e:
            print(e)
            i += 1
            print('[-]Connect retry ' + str(i) + '/' + str(retry_count))
    print('[-]Connect Failed! Please check your Proxy or Network!')

AV_Data_Capture.py

@ -1,162 +1,416 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import glob
import os
import time
import fuckit
from tenacity import retry, stop_after_delay, wait_fixed
import json
import shutil
import itertools
import argparse
from pathlib import Path
from core import *
from ConfigApp import ConfigApp
from PathNameProcessor import PathNameProcessor

# TODO: encapsulate, aggregate and decouple CORE
# TODO: (learn) a unified dependency-management tool
# TODO: share one metadata format (nfo, posters, ...) across media servers where possible: emby, jellyfin, plex
# TODO: subtitle organizing: read all subtitles in a folder, extract their codes and put them into the matching TEMP cache folders

config = ConfigApp()


def safe_list_get(list_in, idx, default=None):
    """
    Safely index into a list.
    :param list_in:
    :param idx:
    :param default:
    :return:
    """
    try:
        return list_in[idx]
    except IndexError:
        return default


def UpdateCheck(version):
    if UpdateCheckSwitch() == '1':
        html2 = get_html('https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/update_check.json')
        html = json.loads(str(html2))

        if not version == html['version']:
            print('[*] * New update ' + html['version'] + ' *')
            print('[*] ↓ Download ↓')
            print('[*] ' + html['download'])
            print('[*]======================================================')
    else:
        print('[+]Update Check disabled!')


def argparse_get_file():
    parser = argparse.ArgumentParser()
    parser.add_argument("file", default='', nargs='?', help="Write the file path on here")
    args = parser.parse_args()
    if args.file == '':
        return ''
    else:
        return args.file


def movie_lists(escape_folders):
    escape_folders = re.split('[,]', escape_folders)
    total = []

    for root, dirs, files in os.walk(config.search_folder):
        if root in escape_folders:
            continue
        for file in files:
            if re.search(PathNameProcessor.pattern_of_file_name_suffixes, file, re.IGNORECASE):
                path = os.path.join(root, file)
                total.append(path)
    return total


# def CEF(path):
#     try:
#         files = os.listdir(path)  # list the sub-files/folders under the path
#         for file in files:
#             os.removedirs(path + '/' + file)  # delete the empty folder
#             print('[+]Deleting empty folder', path + '/' + file)
#     except:
#         a = ''


def get_numbers(paths):
    """Extract the code + episode for each path"""

    def get_number(filepath, absolute_path=False):
        """
        Get the code and episode number.
        :param filepath:
        :param absolute_path:
        :return:
        """
        name = filepath.upper()  # uppercase everything
        if absolute_path:
            name = name.replace('\\', '/')
        # remove distracting fields
        name = PathNameProcessor.remove_distractions(name)
        # extract the episode number that may trail the file path, plus the path with that suffix stripped
        suffix_episode, name = PathNameProcessor.extract_suffix_episode(name)
        # extract the episode number that may follow the code, plus the normalized code
        episode_behind_code, code_number = PathNameProcessor.extract_code(name)
        # no code found -> empty string
        code_number = code_number if code_number else ''
        # prefer the trailing episode; fall back to the one behind the code (unlikely); else empty string
        episode = suffix_episode if suffix_episode else episode_behind_code if episode_behind_code else ''

        return code_number, episode

    maps = {}
    for path in paths:
        number, episode = get_number(path)
        maps[path] = (number, episode)

    return maps


def create_folder(paths):
    for path_to_make in paths:
        if path_to_make:
            try:
                os.makedirs(path_to_make)
            except FileExistsError as e:
                # name = f'{folder=}'.split('=')[0].split('.')[-1]
                print(path_to_make + " already exists")
                pass
            except Exception as exception:
                print('! Failed to create folder ' + path_to_make + ': bad path or insufficient permissions')
                raise exception
        else:
            raise Exception('! The folder path to create is empty, please check')


if __name__ == '__main__':
    version = '2.8.2'

    print('[*]================== AV Data Capture ===================')
    print('[*] Version ' + version)
    print('[*]======================================================')

    # UpdateCheck(version)

    CreatFailedFolder(config.failed_folder)
    os.chdir(os.getcwd())

    # create the folders
    create_folder([config.failed_folder, config.search_folder, config.temp_folder])

    # inside the temp folder, infos holds the per-code json metadata and pics holds the images
    path_infos = config.temp_folder + '/infos'
    path_pics = config.temp_folder + '/pics'
    create_folder([path_infos, path_pics])

    # walk the search folder and collect the paths of all videos
    movie_list = movie_lists(config.escape_folder)
    # test data read from a text file:
    # f = open('TestPathNFO.txt', 'r')
    # f = open('TestPathSpecial.txt', 'r')
    # movie_list = [line[:-1] for line in f.readlines()]
    # f.close()

    # turn the {path: (code, episode)} dict into a [code, episode, path] list
    code_ep_paths = [[codeEposode[0], codeEposode[1], path] for path, codeEposode in get_numbers(movie_list).items()]
    [print(i) for i in code_ep_paths]

    # group the movie list by code (the key step), used to find movies sharing the same code
    '''
    pandas grouping could be used here: "https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html"
    '''
    # # show all columns when printing
    # pd.set_option('display.max_columns', None)
    # # show all rows
    # pd.set_option('display.max_rows', None)
    # # display width for values: 30 (default is 50)
    # pd.set_option('max_colwidth', 30)
    # # build the frame
    # df = pd.DataFrame(code_ep_paths, columns=('code', 'ep', 'path'))
    # # group by code
    # groupedCode_code_ep_paths = df.groupby(['code'])
    # # print(df.groupby(['code', 'ep']).describe().unstack())
    # grouped_code_ep = df.groupby(['code', 'ep'])['path']
    #
    sorted_code_list = sorted(code_ep_paths, key=lambda code_ep_path: code_ep_path[0])
    group_code_list = itertools.groupby(sorted_code_list, key=lambda code_ep_path: code_ep_path[0])


    def group_code_list_to_dict(group_code_list):
        data_dict = {}
        for code, code_ep_path_group in group_code_list:
            code_ep_path_list = list(code_ep_path_group)
            eps_of_code = {}
            group_ep_list = itertools.groupby(code_ep_path_list, key=lambda code_ep_path: code_ep_path[1])
            for ep, group_ep_group in group_ep_list:
                group_ep_list = list(group_ep_group)
                eps_of_code[ep] = [code_ep_path[2] for code_ep_path in group_ep_list]
            data_dict[code] = eps_of_code
        return data_dict


    def print_same_code_ep_path(data_dict_in):
        for code_in in data_dict_in:
            ep_path_list = data_dict_in[code_in]
            if len(ep_path_list) > 1:
                print('--' * 60)
                print("|" + (code_in if code_in else 'unknown') + ":")
                # group_ep_list = itertools.groupby(code_ep_path_list.items(), key=lambda code_ep_path: code_ep_path[0])
                for ep in ep_path_list:
                    path_list = ep_path_list[ep]
                    print('--' * 12)
                    ep = ep if ep else ' '
                    if len(path_list) == 1:
                        print('| episode: ' + ep + ' file: ' + path_list[0])
                    else:
                        print('| episode: ' + ep + ' files: ')
                        for path in path_list:
                            print('|     ' + path)
            else:
                pass


    # the grouped data: {code: {ep: [path]}}
    data_dict_groupby_code_ep = group_code_list_to_dict(group_code_list)

    print('--' * 100)
    print("Movies found: " + str(len(movie_list)))
    print("Total codes: " + str(len(data_dict_groupby_code_ep)) + " (movies sharing a code are counted once; unrecognizable codes are all grouped as 'unknown')")
    print('Warning: !!!! details of the movies sharing the same code follow')
    print('' + '--' * 80)
    print_same_code_ep_path(data_dict_groupby_code_ep)
    print('' + '--' * 80)
    isContinue = input('Press any key to continue, N to quit \n')
    if isContinue.strip(' ') == "N":
        exit(1)

    # ========== drag-and-drop of one-off files ==========
    # number_argparse = argparse_get_file()
    # if not number_argparse == '':
    #     print("[!]Making Data for [" + number_argparse + "], the number is [" + getNumber(number_argparse,
    #                                                                                      absolute_path=True) + "]")
    #     nfo = core_main(number_argparse, getNumber(number_argparse, absolute_path=True))
    #     print("[*]======================================================")
    #     CEF(config.success_folder)
    #     CEF(config.failed_folder)
    #     print("[+]All finished!!!")
    #     input("[+][+]Press enter key exit, you can check the error messge before you exit.")
    #     os._exit(0)
    # ========== drag-and-drop of one-off files ==========


    def download_code_infos(code_list, is_read_cache=True):
        """
        Walk the codes grouped above, scrape each code's info and cache it.
        :param is_read_cache: whether to read cached data
        :param code_list:
        :return: {code: nfo}
        """
        count_all_grouped = len(code_list)
        count = 0
        code_info_dict = {}
        for code in code_list:
            count = count + 1
            percentage = str(count / int(count_all_grouped) * 100)[:4] + '%'
            print('[!] - ' + percentage + ' [' + str(count) + '/' + str(count_all_grouped) + '] -')
            try:
                print("[!]Scraping data for [" + code + "]")
                if code:
                    # the cache file for this code
                    file_path = path_infos + '/' + code + '.json'
                    nfo = {}
                    # read the cached info; scrape online if absent
                    path = Path(file_path)
                    if is_read_cache and (path.exists() and path.is_file() and path.stat().st_size > 0):
                        print('Cached info found')
                        with open(file_path) as fp:
                            nfo = json.load(fp)
                    else:
                        # core feature - scrape the info dict online
                        print('Scraping online')
                        nfo = core_main(code)
                        print('Writing', end='')

                        # write the info into the cache folder; sometimes this fails because the
                        # file is busy, in which case retrying is enough
                        @retry(stop=stop_after_delay(3), wait=wait_fixed(2))
                        def read_file():
                            with open(file_path, 'w') as fp:
                                json.dump(nfo, fp)

                        read_file()
                        print(' done!')
                    # store the code's info in the dict
                    code_info_dict[code] = nfo
                print("[*]======================================================")
            except Exception as e:  # fetching the code's info failed
                code_info_dict[code] = ''
                print("No info found for: " + code + ', Reason: ' + str(e))
                # if config.soft_link:
                #     print('[-]Link', file_path_name, 'to failed folder')
                #     os.symlink(file_path_name, config.failed_folder + '/')
                # else:
                #     try:
                #         print('[-]Move ' + file_path_name + ' to failed folder:' + config.failed_folder)
                #         shutil.move(file_path_name, config.failed_folder + '/')
                #     except FileExistsError:
                #         print('[!]File exists in failed!')
                #     except:
                #         print('[+]skip')
                continue
        return code_info_dict


    print('----------------------------------')
    code_infos = download_code_infos(data_dict_groupby_code_ep)
    print("----Codes with no data found----")
    print([print(code) for code in code_infos if code_infos[code] == ''])
    print("-------------------------")


    def download_images_of_nfos(code_info_dict):
        """
        Walk the code metadata and download each movie's poster images.
        :param code_info_dict:
        :return: codes whose info has no images
        """
        code_list_empty_image = []
        for code in code_info_dict:
            nfo = code_info_dict[code]
            if len(nfo.keys()) == 0:
                code_list_empty_image.append(code)
                continue
            code_pics_folder_to_save = path_pics + '/' + code
            # 1 create the code's folder
            os.makedirs(code_pics_folder_to_save, exist_ok=True)
            # download the thumbnail
            if nfo['imagecut'] == 3:  # 3 means thumbnail
                path = Path(code_pics_folder_to_save + '/' + 'thumb.png')
                if path.exists() and path.is_file() and path.stat().st_size > 0:
                    print(code + ': thumbnail already cached')
                else:
                    print(code + ': downloading thumbnail...')
                    download_file(nfo['cover_small'], code_pics_folder_to_save, 'thumb.png')
                    print(code + ': thumbnail downloaded')
            # download the poster
            path = Path(code_pics_folder_to_save + '/' + 'poster.png')
            if path.exists() and path.is_file() and path.stat().st_size > 0:
                print(code + ': poster already cached')
            else:
                print(code + ': downloading poster...')
                download_file(nfo['cover'], code_pics_folder_to_save, 'poster.png')
                print(code + ': poster downloaded')
        return code_list_empty_image


    code_list_empty = download_images_of_nfos(code_infos)
    print("----Codes with no episode found----")
    print([print(code) for code in code_list_empty])

    print("------Re-scraping the codes with no episode found------")
    code_infos_of_no_ep = download_code_infos(code_list_empty, is_read_cache=False)
    print("----Codes still with no data found----")
    print([print(code) for code in code_infos_of_no_ep if code_infos_of_no_ep[code] == ''])
    print("----------------------")

    # start processing
    # # 2 create the thumbnail poster
    # if nfo['imagecut'] == 3:  # 3 means thumbnail
    #     download_cover_file(nfo['cover_small'], code, code_pics_folder_to_save)
    # # 3 create the image
    # download_image(nfo['cover'], code, code_pics_folder_to_save)
    # # 4 crop
    # crop_image(nfo['imagecut'], code, code_pics_folder_to_save)
    # # 5 background image
    # copy_images_to_background_image(code, code_pics_folder_to_save)
    # 6 create mame.nfo (not needed; when needed, convert the json files in infos into nfo files)
    # make_nfo_file(nfo, code, temp_path_to_save)

    # same-code handling: append -CD[X] per episode; split by video format and size;
    # TODO approach 1: scrape and add nfo, cover, content screenshots, etc.
    # 6 create mame.nfo (not needed; when needed, convert the json files in infos into nfo files)
    make_nfo_file(nfo, code, temp_path_to_save)
    # TODO approach 2: organize: move movies and subtitles by rule into actor, studio, censored-or-not, etc.
    # if config.program_mode == '1':
    #     if multi_part == 1:
    #         number += part  # number gets the CD1 suffix appended here
    #     smallCoverCheck(path, number, imagecut, json_data['cover_small'], c_word, option, filepath, config.failed_folder)  # check the small cover
    #     imageDownload(option, json_data['cover'], number, c_word, path, multi_part, filepath, config.failed_folder)  # creatFoder returns the code's path
    #     cutImage(option, imagecut, path, number, c_word)  # crop the image
    #     copyRenameJpgToBackdrop(option, path, number, c_word)
    #     PrintFiles(option, path, c_word, json_data['naming_rule'], part, cn_sub, json_data, filepath, config.failed_folder, tag)  # write the .nfo file
    #     pasteFileToFolder(filepath, path, number, c_word)  # move the file
    # # ======================================================================= organize mode
    # elif config.program_mode == '2':
    #     pasteFileToFolder_mode2(filepath, path, multi_part, number, part, c_word)  # move the file

    # CEF(config.success_folder)
    # CEF(config.failed_folder)
    print("[+]All finished!!!")
    input("[+][+]Press enter key exit, you can check the error message before you exit.")
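
The commit messages mention storing data in nested dicts; after grouping, data_dict_groupby_code_ep has the shape sketched below. The paths here are made up for illustration; a code that could not be recognized ends up under the empty string and is printed as 'unknown':

# shape of data_dict_groupby_code_ep: {code: {episode: [paths]}}
data_dict_groupby_code_ep = {
    'ABP-123': {
        '': ['./ABP-123.mp4'],                               # no episode detected
        '2': ['./ABP-123-2.mp4', './dup/ABP-123-CD2.mkv'],   # two files claim episode 2
    },
    '': {
        '': ['./random_clip.avi'],                           # unrecognized code, shown as 'unknown'
    },
}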

28
ConfigApp.py Executable file

@ -0,0 +1,28 @@
from configparser import ConfigParser

from MediaServer import MediaServer


class ConfigApp:
    def __init__(self):
        config_file = 'config.ini'
        config = ConfigParser()
        config.read(config_file, encoding='UTF-8')

        self.success_folder = config['common']['success_output_folder']
        self.failed_folder = config['common']['failed_output_folder']  # output folder for failures
        self.escape_folder = config['escape']['folders']  # folders to exclude when scraping nested directories
        self.search_folder = config['common']['search_folder']  # search path
        self.temp_folder = config['common']['temp_folder']  # temp resource path
        self.soft_link = (config['common']['soft_link'] == 1)
        # self.escape_literals = (config['escape']['literals'] == 1)
        self.naming_rule = config['Name_Rule']['naming_rule']
        self.location_rule = config['Name_Rule']['location_rule']
        self.proxy = config['proxy']['proxy']
        self.timeout = float(config['proxy']['timeout'])
        self.retry = int(config['proxy']['retry'])
        self.media_server = MediaServer[config['media']['media_warehouse']]
        self.update_check = config['update']['update_check']
        self.debug_mode = config['debug_mode']['switch']
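
For reference, a config.ini that provides every key ConfigApp reads might look like the sketch below. The values are placeholders, not shipped defaults (the auto-generated config in the old ADC_function.py lacked search_folder, temp_folder and debug_mode). Note that ConfigApp resolves media_warehouse with MediaServer[...], so the value has to match an enum member name from MediaServer.py (EMBY, PLEX or KODI):

[common]
success_output_folder = JAV_output
failed_output_folder = failed
search_folder = .
temp_folder = temp
soft_link = 0

[escape]
folders = failed,JAV_output

[Name_Rule]
location_rule = actor+'/'+number
naming_rule = number+'-'+title

[proxy]
proxy = 127.0.0.1:1081
timeout = 10
retry = 3

[media]
media_warehouse = EMBY

[update]
update_check = 1

[debug_mode]
switch = 0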

0
LICENSE Normal file → Executable file


@ -0,0 +1,19 @@
import pandas as pd
import numpy as np

df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                         'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three',
                         'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
print(df)
groupedA = df.groupby('A').describe()
groupedAB = df.groupby(['A', 'B'])['C']
print('---' * 18)
for a, b in groupedAB:
    print('--' * 18)
    print(a)
    print('-' * 18)
    print(b)


@ -0,0 +1,38 @@
import pandas as pd
import numpy as np

'''
pandas, one of python's three musketeers of data processing
https://pandas.pydata.org/pandas-docs/stable/user_guide
https://www.pypandas.cn/docs/getting_started/10min.html
'''
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(dates)
print(df)

df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
print(df2)
print(df2.dtypes)
print(df.head())
print(df.tail(5))
print(df.index)
print(df.columns)
df.describe()  # summary statistics
df.T  # swap index and columns
df.sort_index(axis=1, ascending=False)  # sort: axis=1 sorts the columns, axis=0 sorts the index
df.sort_values(by='B')  # sort by values, here by the values in column B
# select a column
df.A
df['A']
# slice rows
df['20130102':'20130104']
df[0:3]

28
MediaServer.py Normal file

@ -0,0 +1,28 @@
from enum import Enum, auto


class MediaServer(Enum):
    EMBY = auto()
    PLEX = auto()
    KODI = auto()

    # media = EMBY
    #
    # def __init__(self, arg):
    #     self = [e for e in MediaServer if arg.upper() == self.name]

    def poster_name(self, name):
        if self == MediaServer.EMBY:  # save as [name].png
            return name + '.png'
        elif self == MediaServer.KODI:  # save as [name]-poster.jpg
            return name + '-poster.jpg'
        elif self == MediaServer.PLEX:  # save as poster.jpg
            return 'poster.jpg'

    def image_name(self, name):
        if self == MediaServer.EMBY:  # name.jpg
            return name + '.jpg'
        elif self == MediaServer.KODI:  # [name]-fanart.jpg
            return name + '-fanart.jpg'
        elif self == MediaServer.PLEX:  # fanart.jpg
            return 'fanart.jpg'
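
A quick usage sketch of the enum above, assuming it is imported the way ConfigApp.py does; the code string ABC-123 is arbitrary:

from MediaServer import MediaServer

server = MediaServer['EMBY']                      # how ConfigApp resolves media_warehouse
print(server.poster_name('ABC-123'))              # ABC-123.png
print(MediaServer.KODI.poster_name('ABC-123'))    # ABC-123-poster.jpg
print(MediaServer.PLEX.image_name('ABC-123'))     # fanart.jpg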

3
Metadate.py Normal file

@ -0,0 +1,3 @@
from addict import Dict
# class Metadata:

115
PathNameProcessor.py Normal file

@ -0,0 +1,115 @@
import re

import fuckit


class PathNameProcessor:
    # class variable
    pattern_of_file_name_suffixes = r'.(mov|mp4|avi|rmvb|wmv|mov|mkv|flv|ts|m2ts)$'

    # def __init__(self):

    @staticmethod
    def remove_distractions(origin_name):
        """Remove distractions"""
        # remove the file-type suffix
        origin_name = re.sub(PathNameProcessor.pattern_of_file_name_suffixes, '', origin_name, 0, re.IGNORECASE)
        # handle codes containing minus - and _, e.g. '/-070409_621'
        origin_name = re.sub(r'[-_~*# ]', "-", origin_name, 0)
        origin_name = re.sub(r'(Carib)(bean)?', '-', origin_name, 0, re.IGNORECASE)
        origin_name = re.sub(r'(1pondo)', '-', origin_name, 0, re.IGNORECASE)
        origin_name = re.sub(r'(tokyo)[-. ]?(hot)', '-', origin_name, 0, re.IGNORECASE)
        origin_name = re.sub(r'Uncensored', '-', origin_name, 0, re.IGNORECASE)
        origin_name = re.sub(r'JAV', '-', origin_name, 0, re.IGNORECASE)
        # remove distracting fields
        origin_name = origin_name.replace('22-sht.me', '-')
        # strip dates (years 1970-2099, month, day) from the file name
        pattern_of_date = r'(?:-)(19[789]\d|20\d{2})(-?(0\d|1[012])-?(0[1-9]|[12]\d|3[01])?)?[-.]'
        # resolution markers that start with a letter
        pattern_of_resolution_alphas = r'(?<![a-zA-Z])(SD|((F|U)|(Full|Ultra)[-_*. ~]?)?HD|BD|(blu[-_*. ~]?ray)|[hx]264|[hx]265|HEVC)'
        # resolution markers that start with a digit
        pattern_of_resolution_numbers = r'(?<!\d)(4K|(1080[ip])|(720p)|(480p))'
        origin_name = re.sub(pattern_of_resolution_alphas, "-", origin_name, 0, re.IGNORECASE)
        origin_name = re.sub(pattern_of_resolution_numbers, "-", origin_name, 0, re.IGNORECASE)
        origin_name = re.sub(pattern_of_date, "-", origin_name)
        if 'FC2' or 'fc2' in origin_name:
            origin_name = origin_name.replace('-PPV', '').replace('PPV-', '').replace('FC2PPV-', 'FC2-').replace(
                'FC2PPV_', 'FC2-')
        # collapse runs of repeated meaningless symbols -
        origin_name = re.sub(r"([-.])(\1+)", r"\1", origin_name)
        # strip trailing meaningless symbols so the episode number is easier to recognize
        origin_name = re.sub(r'[-.]+$', "", origin_name)
        return origin_name

    @staticmethod
    def extract_suffix_episode(origin_name):
        """Extract the trailing episode marker (one character only): 123ABC, part1, ipz.A, CD1, NOP019B.HD.wmv"""
        episode = None
        with fuckit:
            # zero-width assertion to grab a trailing digit as the episode number: 123
            pattern_episodes_number = r'(?<!\d)\d$'
            episode = re.findall(pattern_episodes_number, origin_name)[-1]
            origin_name = re.sub(pattern_episodes_number, "", origin_name)
        with fuckit:
            # zero-width assertion to grab a trailing letter as the episode number: abc
            pattern_episodes_alpha = r'(?<![a-zA-Z])[a-zA-Z]$'
            episode = re.findall(pattern_episodes_alpha, origin_name)[-1]
            origin_name = re.sub(pattern_episodes_alpha, "", origin_name)
        return episode, origin_name

    @staticmethod
    def extract_code(origin_name):
        """
        Extract the episode number and the normalized code.
        """
        name = None
        episode = None
        with fuckit:
            # find the code, with or without -: 1. digits+digits 2. letters+digits
            name = re.findall(r'(?:\d{2,}-\d{2,})|(?:[A-Z]+-?[A-Z]*\d{2,})', origin_name)[-1]
            episode = PathNameProcessor.extract_episode_behind_code(origin_name, name)
            # add a - to names that lack one
            if not ('-' in name):
                # a code without a minus -, try to split it and insert one
                # non-greedy match of non-special characters, zero-width assertion, then at least 2 consecutive digits: ipz221.part2, mide072hhb, n1180
                with fuckit:
                    name = re.findall(r'[a-zA-Z]+\d{2,}', name)[-1]
                    # e.g. MCDV-47 and mcdv-047 are two different movies, while SIVR-00008 and SIVR-008 are
                    # the same one; heyzo is the exception, heyzo uses four digits
                    if "heyzo" not in name.lower():
                        name = re.sub(r'([a-zA-Z]{2,})(?:0*?)(\d{2,})', r'\1-\2', name)
            # regex for codes containing -: [letters-[letters]digits], with more than 2 digits; take the last match
            with fuckit:
                # MKBD_S03-MaRieS
                name = re.findall(r'[a-zA-Z|\d]+-[a-zA-Z|\d]*\d{2,}', name)[-1]
                # 107NTTR-037 -> NTTR-037, SIVR-00008 -> SIVR-008; heyzo is the exception
                if "heyzo" not in name.lower():
                    searched = re.search(r'([a-zA-Z]{2,})-(?:0*)(\d{3,})', name)
                    if searched:
                        name = '-'.join(searched.groups())
        return episode, name

    @staticmethod
    def extract_episode_behind_code(origin_name, code):
        episode = None
        with fuckit:
            # zero-width assertion to grab the episode letter/digit right behind the code: abc123
            result_dict = re.search(rf'(?<={code})-?((?P<alpha>([A-Z](?![A-Z])))|(?P<num>\d(?!\d)))', origin_name,
                                    re.I).groupdict()
            episode = result_dict['alpha'] or result_dict['num']
        return episode


def safe_list_get(list_in, idx, default):
    try:
        return list_in[idx]
    except IndexError:
        return default
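
A sanity check of the extraction pipeline above, matching the commit note that a trailing -C (Chinese-subtitle marker) is still read as an episode letter. ABP-123-C is a made-up name, and the expected values follow the inline comments rather than a verified run:

from PathNameProcessor import PathNameProcessor

episode, rest = PathNameProcessor.extract_suffix_episode('ABP-123-C')
print(episode)  # 'C' - the subtitle marker is indistinguishable from an episode letter
episode_behind, code = PathNameProcessor.extract_code(rest)
print(code)     # 'ABP-123'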

19
Pipfile Normal file

@ -0,0 +1,19 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
[packages]
bs4 = "*"
tenacity = "*"
fuckit = "*"
requests = "*"
image = "*"
lazyxml = {editable = true,git = "https://github.com/waynedyck/lazyxml.git",ref = "python-3-conversion_wd1"}
lxml = "*"
pyquery = "*"
[requires]
python_version = "3.8"

246
Pipfile.lock generated Normal file

@ -0,0 +1,246 @@
{
"_meta": {
"hash": {
"sha256": "15bf3c6af3ec315358a0217481a13285f95fc742bb5db8a1f934e0d1c3d7d5e2"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.8"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"asgiref": {
"hashes": [
"sha256:5ee950735509d04eb673bd7f7120f8fa1c9e2df495394992c73234d526907e17",
"sha256:7162a3cb30ab0609f1a4c95938fd73e8604f63bdba516a7f7d64b83ff09478f0"
],
"markers": "python_version >= '3.5'",
"version": "==3.3.1"
},
"beautifulsoup4": {
"hashes": [
"sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
"sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25",
"sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"
],
"version": "==4.9.3"
},
"bs4": {
"hashes": [
"sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
],
"index": "pypi",
"version": "==0.0.1"
},
"certifi": {
"hashes": [
"sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c",
"sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"
],
"version": "==2020.12.5"
},
"chardet": {
"hashes": [
"sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa",
"sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.0.0"
},
"cssselect": {
"hashes": [
"sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf",
"sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.1.0"
},
"django": {
"hashes": [
"sha256:2d78425ba74c7a1a74b196058b261b9733a8570782f4e2828974777ccca7edf7",
"sha256:efa2ab96b33b20c2182db93147a0c3cd7769d418926f9e9f140a60dca7c64ca9"
],
"markers": "python_version >= '3.6'",
"version": "==3.1.5"
},
"fuckit": {
"hashes": [
"sha256:059488e6aa2053da9db5eb5101e2498f608314da5118bf2385acb864568ccc25"
],
"index": "pypi",
"version": "==4.8.1"
},
"idna": {
"hashes": [
"sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6",
"sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.10"
},
"image": {
"hashes": [
"sha256:baa2e09178277daa50f22fd6d1d51ec78f19c12688921cb9ab5808743f097126"
],
"index": "pypi",
"version": "==1.5.33"
},
"lazyxml": {
"editable": true,
"git": "https://github.com/waynedyck/lazyxml.git",
"ref": "f42ea4a4febf4c1e120b05d6ca9cef42556a75d5"
},
"lxml": {
"hashes": [
"sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d",
"sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37",
"sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01",
"sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2",
"sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644",
"sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75",
"sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80",
"sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2",
"sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780",
"sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98",
"sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308",
"sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf",
"sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388",
"sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d",
"sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3",
"sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8",
"sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af",
"sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2",
"sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e",
"sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939",
"sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03",
"sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d",
"sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a",
"sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5",
"sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a",
"sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711",
"sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf",
"sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089",
"sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505",
"sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b",
"sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f",
"sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc",
"sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e",
"sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931",
"sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc",
"sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe",
"sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e"
],
"index": "pypi",
"version": "==4.6.2"
},
"pillow": {
"hashes": [
"sha256:165c88bc9d8dba670110c689e3cc5c71dbe4bfb984ffa7cbebf1fac9554071d6",
"sha256:1d208e670abfeb41b6143537a681299ef86e92d2a3dac299d3cd6830d5c7bded",
"sha256:22d070ca2e60c99929ef274cfced04294d2368193e935c5d6febfd8b601bf865",
"sha256:2353834b2c49b95e1313fb34edf18fca4d57446675d05298bb694bca4b194174",
"sha256:39725acf2d2e9c17356e6835dccebe7a697db55f25a09207e38b835d5e1bc032",
"sha256:3de6b2ee4f78c6b3d89d184ade5d8fa68af0848f9b6b6da2b9ab7943ec46971a",
"sha256:47c0d93ee9c8b181f353dbead6530b26980fe4f5485aa18be8f1fd3c3cbc685e",
"sha256:5e2fe3bb2363b862671eba632537cd3a823847db4d98be95690b7e382f3d6378",
"sha256:604815c55fd92e735f9738f65dabf4edc3e79f88541c221d292faec1904a4b17",
"sha256:6c5275bd82711cd3dcd0af8ce0bb99113ae8911fc2952805f1d012de7d600a4c",
"sha256:731ca5aabe9085160cf68b2dbef95fc1991015bc0a3a6ea46a371ab88f3d0913",
"sha256:7612520e5e1a371d77e1d1ca3a3ee6227eef00d0a9cddb4ef7ecb0b7396eddf7",
"sha256:7916cbc94f1c6b1301ac04510d0881b9e9feb20ae34094d3615a8a7c3db0dcc0",
"sha256:81c3fa9a75d9f1afafdb916d5995633f319db09bd773cb56b8e39f1e98d90820",
"sha256:887668e792b7edbfb1d3c9d8b5d8c859269a0f0eba4dda562adb95500f60dbba",
"sha256:93a473b53cc6e0b3ce6bf51b1b95b7b1e7e6084be3a07e40f79b42e83503fbf2",
"sha256:96d4dc103d1a0fa6d47c6c55a47de5f5dafd5ef0114fa10c85a1fd8e0216284b",
"sha256:a3d3e086474ef12ef13d42e5f9b7bbf09d39cf6bd4940f982263d6954b13f6a9",
"sha256:b02a0b9f332086657852b1f7cb380f6a42403a6d9c42a4c34a561aa4530d5234",
"sha256:b09e10ec453de97f9a23a5aa5e30b334195e8d2ddd1ce76cc32e52ba63c8b31d",
"sha256:b6f00ad5ebe846cc91763b1d0c6d30a8042e02b2316e27b05de04fa6ec831ec5",
"sha256:bba80df38cfc17f490ec651c73bb37cd896bc2400cfba27d078c2135223c1206",
"sha256:c3d911614b008e8a576b8e5303e3db29224b455d3d66d1b2848ba6ca83f9ece9",
"sha256:ca20739e303254287138234485579b28cb0d524401f83d5129b5ff9d606cb0a8",
"sha256:cb192176b477d49b0a327b2a5a4979552b7a58cd42037034316b8018ac3ebb59",
"sha256:cdbbe7dff4a677fb555a54f9bc0450f2a21a93c5ba2b44e09e54fcb72d2bd13d",
"sha256:cf6e33d92b1526190a1de904df21663c46a456758c0424e4f947ae9aa6088bf7",
"sha256:d355502dce85ade85a2511b40b4c61a128902f246504f7de29bbeec1ae27933a",
"sha256:d673c4990acd016229a5c1c4ee8a9e6d8f481b27ade5fc3d95938697fa443ce0",
"sha256:dc577f4cfdda354db3ae37a572428a90ffdbe4e51eda7849bf442fb803f09c9b",
"sha256:dd9eef866c70d2cbbea1ae58134eaffda0d4bfea403025f4db6859724b18ab3d",
"sha256:f50e7a98b0453f39000619d845be8b06e611e56ee6e8186f7f60c3b1e2f0feae"
],
"markers": "python_version >= '3.6'",
"version": "==8.1.0"
},
"pyquery": {
"hashes": [
"sha256:1fc33b7699455ed25c75282bc8f80ace1ac078b0dda5a933dacbd8b1c1f83963",
"sha256:a388eefb6bc4a55350de0316fbd97cda999ae669b6743ae5b99102ba54f5aa72"
],
"index": "pypi",
"version": "==1.4.3"
},
"pytz": {
"hashes": [
"sha256:16962c5fb8db4a8f63a26646d8886e9d769b6c511543557bc84e9569fb9a9cb4",
"sha256:180befebb1927b16f6b57101720075a984c019ac16b1b7575673bea42c6c3da5"
],
"version": "==2020.5"
},
"requests": {
"hashes": [
"sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
"sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"
],
"index": "pypi",
"version": "==2.25.1"
},
"six": {
"hashes": [
"sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
"sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.15.0"
},
"soupsieve": {
"hashes": [
"sha256:4bb21a6ee4707bf43b61230e80740e71bfe56e55d1f1f50924b087bb2975c851",
"sha256:6dc52924dc0bc710a5d16794e6b3480b2c7c08b07729505feab2b2c16661ff6e"
],
"markers": "python_version >= '3.0'",
"version": "==2.1"
},
"sqlparse": {
"hashes": [
"sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0",
"sha256:0f91fd2e829c44362cbcfab3e9ae12e22badaa8a29ad5ff599f9ec109f0454e8"
],
"markers": "python_version >= '3.5'",
"version": "==0.4.1"
},
"tenacity": {
"hashes": [
"sha256:baed357d9f35ec64264d8a4bbf004c35058fad8795c5b0d8a7dc77ecdcbb8f39",
"sha256:e14d191fb0a309b563904bbc336582efe2037de437e543b38da749769b544d7f"
],
"index": "pypi",
"version": "==6.3.1"
},
"urllib3": {
"hashes": [
"sha256:19188f96923873c92ccb987120ec4acaa12f0461fa9ce5d3d0772bc965a39e08",
"sha256:d8ff90d979214d7b4f8ce956e80f4028fc6860e4431f731ea4a8c08f23f99473"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.2"
}
},
"develop": {}
}

0
README.md Normal file → Executable file

229
avsox.py → SiteSource/avsox.py Normal file → Executable file

@ -1,115 +1,116 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)

def getActorPhoto(htmlcode):  #//*[@id="star_qdt"]/li/a/img
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'avatar-box'})
    d = {}
    for i in a:
        l = i.img['src']
        t = i.span.get_text()
        p2 = {t: l}
        d.update(p2)
    return d

def getTitle(a):
    try:
        html = etree.fromstring(a, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']")  #[0]
        return result.replace('/', '')
    except:
        return ''

def getActor(a):  #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    soup = BeautifulSoup(a, 'lxml')
    a = soup.find_all(attrs={'class': 'avatar-box'})
    d = []
    for i in a:
        d.append(i.span.get_text())
    return d

def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '", ' ')
    return result1

def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
    return result1

def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
    return result1

def getNum(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
    return result1

def getYear(release):
    try:
        result = str(re.search('\d{4}', release).group())
        return result
    except:
        return release

def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
    return result1

def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
    return result

def getCover_small(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
    return result

def getTag(a):  # get actors
    soup = BeautifulSoup(a, 'lxml')
    a = soup.find_all(attrs={'class': 'genre'})
    d = []
    for i in a:
        d.append(i.get_text())
    return d

def main(number):
    url = 'https://avsox.host/cn/search/' + number
    a = get_html(url)
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    if result1 == '' or result1 == 'null' or result1 == 'None':
        a = get_html('https://avsox.host/cn/search/' + number.replace('-', '_'))
        print(a)
        html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
        if result1 == '' or result1 == 'null' or result1 == 'None':
            a = get_html('https://avsox.host/cn/search/' + number.replace('_', ''))
            print(a)
            html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
            result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    web = get_html(result1)
    soup = BeautifulSoup(web, 'lxml')
    info = str(soup.find(attrs={'class': 'row movie'}))
    dic = {
        'actor': getActor(web),
        'title': getTitle(web).strip(getNum(web)),
        'studio': getStudio(info),
        'outline': '',  #
        'runtime': getRuntime(info),
        'director': '',  #
        'release': getRelease(info),
        'number': getNum(info),
        'cover': getCover(web),
        'cover_small': getCover_small(a),
        'imagecut': 3,
        'tag': getTag(web),
        'label': getLabel(info),
        'year': getYear(getRelease(info)),  # str(re.search('\d{4}',getRelease(a)).group()),
        'actor_photo': getActorPhoto(web),
        'website': result1,
        'source': 'avsox.py',
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js

#print(main('012717_472'))

458
fanza.py → SiteSource/fanza.py Normal file → Executable file

@ -1,229 +1,229 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import json
import re
from lxml import etree
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)


def getTitle(text):
    html = etree.fromstring(text, etree.HTMLParser())
    result = html.xpath('//*[@id="title"]/text()')[0]
    return result


def getActor(text):
    # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    html = etree.fromstring(text, etree.HTMLParser())
    result = (
        str(
            html.xpath(
                "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
            )
        )
        .strip(" ['']")
        .replace("', '", ",")
    )
    return result


def getStudio(text):
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'メーカー')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'メーカー')]/following-sibling::td/text()"
        )[0]
    return result


def getRuntime(text):
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
    return re.search(r"\d+", str(result)).group()


def getLabel(text):
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
        )[0]
    return result


def getNum(text):
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'品番:')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'品番:')]/following-sibling::td/text()"
        )[0]
    return result


def getYear(getRelease):
    try:
        result = str(re.search(r"\d{4}", getRelease).group())
        return result
    except:
        return getRelease


def getRelease(text):
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
        )[0].lstrip("\n")
    except:
        result = html.xpath(
            "//td[contains(text(),'発売日:')]/following-sibling::td/text()"
        )[0].lstrip("\n")
    return result


def getTag(text):
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
        )
    except:
        result = html.xpath(
            "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
        )
    return result


def getCover(text, number):
    html = etree.fromstring(text, etree.HTMLParser())
    cover_number = number
    try:
        result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
    except:
        # sometimes fanza changes _ to \u005f in the image id
        if "_" in cover_number:
            cover_number = cover_number.replace("_", r"\u005f")
        try:
            result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
        except:
            # (TODO) handle more edge cases
            # raise an exception here, same behavior as before;
            # people's major requirement is fetching the picture
            raise ValueError("can not find image")
    return result


def getDirector(text):
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'監督:')]/following-sibling::td/a/text()"
        )[0]
    except:
        result = html.xpath(
            "//td[contains(text(),'監督:')]/following-sibling::td/text()"
        )[0]
    return result


def getOutline(text):
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace(
            "\n", ""
        )
        if result == "":
            result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace(
                "\n", ""
            )
    except:
        # (TODO) handle more edge cases
        return ""
    return result


def main(number):
    # fanza allows letter + number + underscore; normalize the input here
    # @note: the only underscore usage found so far is h_test123456789
    fanza_search_number = number
    # AV_Data_Capture.py.getNumber() over-formats the input, so restore the h_ prefix
    if fanza_search_number.startswith("h-"):
        fanza_search_number = fanza_search_number.replace("h-", "h_")
    fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()
    fanza_urls = [
        "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
    ]
    chosen_url = ""
    for url in fanza_urls:
        chosen_url = url + fanza_search_number
        htmlcode = get_html(chosen_url)
        if "404 Not Found" not in htmlcode:
            break
    if "404 Not Found" in htmlcode:
        return json.dumps({"title": "", })
    try:
        # for some old pages the input number does not match the page:
        # the url may be cid=test012 while the hinban on the page is test00012,
        # so get the hinban first and pass it to the following functions
        fanza_hinban = getNum(htmlcode)
        data = {
            "title": getTitle(htmlcode).strip(getActor(htmlcode)),
            "studio": getStudio(htmlcode),
            "outline": getOutline(htmlcode),
            "runtime": getRuntime(htmlcode),
            "director": getDirector(htmlcode) if "anime" not in chosen_url else "",
            "actor": getActor(htmlcode) if "anime" not in chosen_url else "",
            "release": getRelease(htmlcode),
            "number": fanza_hinban,
            "cover": getCover(htmlcode, fanza_hinban),
            "imagecut": 1,
            "tag": getTag(htmlcode),
            "label": getLabel(htmlcode),
            "year": getYear(getRelease(htmlcode)),  # str(re.search('\d{4}',getRelease(a)).group()),
            "actor_photo": "",
            "website": chosen_url,
            "source": "fanza.py",
        }
    except:
        data = {
            "title": "",
        }
    js = json.dumps(
        data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
    )  # .encode('UTF-8')
    return js


if __name__ == "__main__":
    # print(main("DV-1562"))
    # input("[+][+]Press enter key exit, you can check the error message before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
    # print(main("ipx292"))
    pass
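
The URL probing at the top of main() is the part worth seeing in isolation: fanza serves the same cid under several site layouts, so the scraper normalizes the input and walks a candidate list until one page stops returning 404. A minimal standalone sketch of that strategy; probe_fanza is a hypothetical helper name, and requests stands in for the project's get_html wrapper purely to keep the sketch self-contained:

import re
import requests

# candidate layouts, in the same order main() probes them
FANZA_BASES = [
    "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
    "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
    "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
    "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
]

def probe_fanza(number):
    # fanza cids only allow letters, digits and underscore
    cid = re.sub(r"[^0-9a-zA-Z_]", "", number).lower()
    for base in FANZA_BASES:
        url = base + cid
        page = requests.get(url).text
        if "404 Not Found" not in page:
            return url, page  # the first layout that resolves wins
    return None, None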


@@ -1,162 +1,162 @@
import re
from lxml import etree  # need install
import json
import ADC_function
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)


def getTitle(htmlcode):  # get title
    # print(htmlcode)
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h3/text()')).strip(" ['']")
    result2 = str(re.sub('\D{2}2-\d+', '', result)).replace(' ', '', 1)
    # print(result2)
    return result2


def getActor(htmlcode):  # get actor
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[5]/a/text()')).strip(" ['']")
        return result
    except:
        return ''


def getStudio(htmlcode):  # get studio
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[3]/a[1]/text()')).strip(" ['']")
    return result


def getNum(htmlcode):  # get the movie ID
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    # print(result)
    return result


def getRelease(htmlcode2):  # get release date
    # a=ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id='+str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-")+'&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
    return result


def getCover(htmlcode, number, htmlcode2):  # get cover
    # a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-") + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[1]/a/img/@src')).strip(" ['']")
    if result == '':
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result2 = str(html.xpath('//*[@id="slider"]/ul[1]/li[1]/img/@src')).strip(" ['']")
        return 'https://fc2club.com' + result2
    return 'http:' + result


def getOutline(htmlcode2):  # get outline
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div/article/section[4]/p/text()')).strip(" ['']").replace("\\n", '', 10000).replace("'", '', 10000).replace(', ,', '').strip(' ').replace('。,', ',')
    return result


def getTag(htmlcode):  # get tags
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[4]/a/text()'))
    return result.strip(" ['']").replace("'", '').replace(' ', '')


def getYear(release):
    try:
        result = re.search('\d{4}', release).group()
        return result
    except:
        return ''


def getTitle_fc2com(htmlcode):  # get title (fc2.com)
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0]
    return result


def getActor_fc2com(htmlcode):
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0]
        return result
    except:
        return ''


def getStudio_fc2com(htmlcode):  # get studio (fc2.com)
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']")
        return result
    except:
        return ''


def getNum_fc2com(htmlcode):  # get the movie ID (fc2.com)
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    return result


def getRelease_fc2com(htmlcode2):  # get release date (fc2.com)
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
    return result


def getCover_fc2com(htmlcode2):  # get cover (fc2.com)
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']")
    return 'http:' + result


def getOutline_fc2com(htmlcode2):  # get outline (fc2.com)
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n", '', 10000).replace("'", '', 10000).replace(', ,', '').strip(' ').replace('。,', ',')
    return result


def getTag_fc2com(number):  # get tags (fc2.com)
    htmlcode = str(bytes(ADC_function.get_html('http://adult.contents.fc2.com/api/v4/article/' + number + '/tag?'), 'utf-8').decode('unicode-escape'))
    result = re.findall('"tag":"(.*?)"', htmlcode)
    return result


def getYear_fc2com(release):
    try:
        result = re.search('\d{4}', release).group()
        return result
    except:
        return ''


def main(number):
    try:
        htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/')
        htmlcode = ADC_function.get_html('https://fc2club.com//html/FC2-' + number + '.html')
        actor = getActor(htmlcode)
        if getActor(htmlcode) == '':
            actor = 'FC2系列'
        dic = {
            'title': getTitle(htmlcode),
            'studio': getStudio(htmlcode),
            'year': '',  # str(re.search('\d{4}',getRelease(number)).group()),
            'outline': '',  # getOutline(htmlcode2),
            'runtime': getYear(getRelease(htmlcode)),
            'director': getStudio(htmlcode),
            'actor': actor,
            'release': getRelease(htmlcode2),
            'number': 'FC2-' + number,
            'label': '',
            'cover': getCover(htmlcode, number, htmlcode2),
            'imagecut': 0,
            'tag': getTag(htmlcode),
            'actor_photo': '',
            'website': 'https://fc2club.com//html/FC2-' + number + '.html',
            'source': 'https://fc2club.com//html/FC2-' + number + '.html',
        }
        if dic['title'] == '':
            htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', cookies={'wei6H': '1'})
            actor = getActor(htmlcode)
            if getActor(htmlcode) == '':
                actor = 'FC2系列'
            dic = {
                'title': getTitle_fc2com(htmlcode2),
                'studio': getStudio_fc2com(htmlcode2),
                'year': '',  # str(re.search('\d{4}',getRelease(number)).group()),
                'outline': getOutline_fc2com(htmlcode2),
                'runtime': getYear_fc2com(getRelease(htmlcode2)),
                'director': getStudio_fc2com(htmlcode2),
                'actor': actor,
                'release': getRelease_fc2com(htmlcode2),
                'number': 'FC2-' + number,
                'cover': getCover_fc2com(htmlcode2),
                'imagecut': 0,
                'tag': getTag_fc2com(number),
                'label': '',
                'actor_photo': '',
                'website': 'http://adult.contents.fc2.com/article/' + number + '/',
                'source': 'http://adult.contents.fc2.com/article/' + number + '/',
            }
    except Exception as e:
        # (TODO) better handle this
        # print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js

# print(main('1252953'))
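
main() above chains two sources: the fc2club mirror first, then fc2.com (which wants the wei6H=1 cookie before it serves article pages) whenever fc2club has no entry. A condensed sketch of just that control flow, reusing the module's own names, so it assumes it sits alongside the functions above; pick_source is a hypothetical helper, not part of the commit:

def pick_source(number):
    # try the fc2club mirror first
    htmlcode = ADC_function.get_html('https://fc2club.com//html/FC2-' + number + '.html')
    if getTitle(htmlcode) != '':
        return 'fc2club', htmlcode
    # fall back to fc2.com; the wei6H cookie skips its interstitial page
    htmlcode2 = ADC_function.get_html(
        'https://adult.contents.fc2.com/article/' + number + '/',
        cookies={'wei6H': '1'})
    return 'fc2.com', htmlcode2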


@@ -1,138 +1,139 @@
import re
from pyquery import PyQuery as pq  # need install
from lxml import etree  # need install
from bs4 import BeautifulSoup  # need install
import json
from ADC_function import *


def getActorPhoto(htmlcode):  # //*[@id="star_qdt"]/li/a/img
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'star-name'})
    d = {}
    for i in a:
        l = i.a['href']
        t = i.get_text()
        html = etree.fromstring(get_html(l), etree.HTMLParser())
        p = str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")
        p2 = {t: p}
        d.update(p2)
    return d


def getTitle(htmlcode):  # get title
    doc = pq(htmlcode)
    title = str(doc('div.container h3').text()).replace(' ', '-')
    try:
        title2 = re.sub('n\d+-', '', title)
        return title2
    except:
        return title


def getStudio(htmlcode):  # get studio
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
    return result


def getYear(htmlcode):  # get year
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
    return result


def getCover(htmlcode):  # get cover link
    doc = pq(htmlcode)
    image = doc('a.bigImage')
    return image.attr('href')


def getRelease(htmlcode):  # get release date
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
    return result


def getRuntime(htmlcode):  # get runtime in minutes
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find(text=re.compile('分鐘'))
    return a


def getActor(htmlcode):  # get actresses
    b = []
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'star-name'})
    for i in a:
        b.append(i.get_text())
    return b


def getNum(htmlcode):  # get the movie ID
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    return result


def getDirector(htmlcode):  # get director
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
    return result


def getOutline(htmlcode):  # get outline
    doc = pq(htmlcode)
    result = str(doc('tr td div.mg-b20.lh4 p.mg-b20').text())
    return result


def getSerise(htmlcode):  # get series/label
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
    return result


def getTag(htmlcode):  # get tags
    tag = []
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'genre'})
    for i in a:
        if 'onmouseout' in str(i):
            continue
        tag.append(i.get_text())
    return tag


def main(number):
    try:
        htmlcode = get_html('https://www.javbus.com/' + number)
        try:
            dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", ''))
        except:
            dww_htmlcode = ''
        dic = {
            'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
            'studio': getStudio(htmlcode),
            'year': str(re.search('\d{4}', getYear(htmlcode)).group()),
            'outline': getOutline(dww_htmlcode),
            'runtime': getRuntime(htmlcode),
            'director': getDirector(htmlcode),
            'actor': getActor(htmlcode),
            'release': getRelease(htmlcode),
            'number': getNum(htmlcode),
            'cover': getCover(htmlcode),
            'imagecut': 1,
            'tag': getTag(htmlcode),
            'label': getSerise(htmlcode),
            'actor_photo': getActorPhoto(htmlcode),
            'website': 'https://www.javbus.com/' + number,
            'source': 'javbus.py',
        }
        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
        return js
    except:
        return main_uncensored(number)


def main_uncensored(number):  # uncensored
    htmlcode = get_html('https://www.javbus.com/' + number)
    dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", ''))
    if getTitle(htmlcode) == '':
        htmlcode = get_html('https://www.javbus.com/' + number.replace('-', '_'))
        dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", ''))
    dic = {
        'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))).replace(getNum(htmlcode) + '-', ''),
        'studio': getStudio(htmlcode),
        'year': getYear(htmlcode),
        'outline': getOutline(dww_htmlcode),
        'runtime': getRuntime(htmlcode),
        'director': getDirector(htmlcode),
        'actor': getActor(htmlcode),
        'release': getRelease(htmlcode),
        'number': getNum(htmlcode),
        'cover': getCover(htmlcode),
        'tag': getTag(htmlcode),
        'label': getSerise(htmlcode),
        'imagecut': 0,
        'actor_photo': '',
        'website': 'https://www.javbus.com/' + number,
        'source': 'javbus.py',
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js
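
main() above is the censored path; any parse failure falls through to main_uncensored(), which additionally retries the ID with '-' swapped for '_' when the first javbus page comes back titleless, so callers only ever see one JSON payload. A hedged usage sketch (the ID is illustrative, taken from the commented examples elsewhere in this diff):

if __name__ == '__main__':
    import json

    info = json.loads(main('ipx-292'))  # illustrative ID only
    # an empty title means both the censored and uncensored paths failed
    print(info.get('title', '') or 'no match')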


@@ -1,123 +1,123 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)


def getTitle(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath("/html/body/section/div/h2/strong/text()")[0]
    return result


def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ')


def getActorPhoto(actor):  # //*[@id="star_qdt"]/li/a/img
    a = actor.split(',')
    d = {}
    for i in a:
        p = {i: ''}
        d.update(p)
    return d


def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')


def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').rstrip('mi')


def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')


def getNum(a):
    html = etree.fromstring(a, etree.HTMLParser())
    result1 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(result2 + result1).strip('+')


def getYear(getRelease):
    try:
        result = str(re.search('\d{4}', getRelease).group())
        return result
    except:
        return getRelease


def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+')


def getTag(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',')


def getCover_small(a, index=0):
    # same issue mentioned below:
    # javdb sometimes returns multiple results,
    # so DO NOT just take the first one; get the one with the correct index number
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
    if not 'https' in result:
        result = 'https:' + result
    return result


def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath("//div[@class='column column-video-cover']/a/img/@src")).strip(" ['']")
    return result


def getDirector(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')


def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']")
    return result


def main(number):
    try:
        number = number.upper()
        query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
        html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
        # javdb sometimes returns multiple results,
        # and the first element may not be the one we are looking for,
        # so iterate all candidates and find the matching one
        urls = html.xpath('//*[@id="videos"]/div/div/a/@href')
        ids = html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()')
        correct_url = urls[ids.index(number)]
        detail_page = get_html('https://javdb.com' + correct_url)
        dic = {
            'actor': getActor(detail_page),
            'title': getTitle(detail_page),
            'studio': getStudio(detail_page),
            'outline': getOutline(detail_page),
            'runtime': getRuntime(detail_page),
            'director': getDirector(detail_page),
            'release': getRelease(detail_page),
            'number': getNum(detail_page),
            'cover': getCover(detail_page),
            'cover_small': getCover_small(query_result, index=ids.index(number)),
            'imagecut': 3,
            'tag': getTag(detail_page),
            'label': getLabel(detail_page),
            'year': getYear(getRelease(detail_page)),  # str(re.search('\d{4}',getRelease(a)).group()),
            'actor_photo': getActorPhoto(getActor(detail_page)),
            'website': 'https://javdb.com' + correct_url,
            'source': 'javdb.py',
        }
    except Exception as e:
        # print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js

# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error message before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
# print(main('ipx-292'))
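
The index-matching step in main() is the piece this scraper depends on: javdb's search page can return several candidates and the first hit is not always the right one, so the result hrefs and their uid labels are paired up and indexed by the exact number. A minimal sketch of just that step, assuming the same //*[@id="videos"] layout used above; pick_candidate is a hypothetical helper name:

from lxml import etree

def pick_candidate(query_result, number):
    html = etree.fromstring(query_result, etree.HTMLParser())
    urls = html.xpath('//*[@id="videos"]/div/div/a/@href')
    ids = html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()')
    index = ids.index(number)  # raises ValueError when there is no exact match
    return urls[index], index  # the same index also selects the matching small cover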


@@ -1,108 +1,108 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)


def getTitle(a):
    try:
        html = etree.fromstring(a, etree.HTMLParser())
        result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']")
        return result.replace('/', ',')
    except:
        return ''


def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    result2 = str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '').replace('/', ',')


def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')


def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(result1 + result2).strip('+').rstrip('mi')


def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')


def getNum(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(result1 + result2).strip('+')


def getYear(getRelease):
    try:
        result = str(re.search('\d{4}', getRelease).group())
        return result
    except:
        return getRelease


def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(result1 + result2).strip('+')


def getTag(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(result1 + result2).strip('+').replace("', '\\n", ",").replace("', '", "").replace('"', '')


def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
    # /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src
    return result


def getDirector(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')


def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']")
    return result


def main(number2):
    number = number2.upper()
    htmlcode = str(get_html('https://www.mgstage.com/product/product_detail/' + str(number) + '/', cookies={'adc': '1'}))
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ', '').replace('  ', '').replace('\n ', '').replace('\n ', '')
    dic = {
        'title': getTitle(htmlcode).replace("\\n", '').replace('  ', ''),
        'studio': getStudio(a),
        'outline': getOutline(htmlcode),
        'runtime': getRuntime(a),
        'director': getDirector(a),
        'actor': getActor(a),
        'release': getRelease(a),
        'number': getNum(a),
        'cover': getCover(htmlcode),
        'imagecut': 0,
        'tag': getTag(a),
        'label': getLabel(a),
        'year': getYear(getRelease(a)),  # str(re.search('\d{4}',getRelease(a)).group()),
        'actor_photo': '',
        'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
        'source': 'mgstage.py',
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js

# print(main('SIRO-3607'))
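
Two details in main() above are easy to miss: the adc=1 cookie that bypasses mgstage's age gate, and the flattening of the detail_data table into one whitespace-stripped string before the th/td lookups run against it. A small sketch of just the fetch-and-flatten step; fetch_detail_block is a hypothetical helper name, and requests stands in for the project's get_html wrapper purely for self-containedness:

import requests
from bs4 import BeautifulSoup

def fetch_detail_block(number):
    url = 'https://www.mgstage.com/product/product_detail/' + number.upper() + '/'
    # the adc=1 cookie marks mgstage's age check as already answered
    htmlcode = requests.get(url, cookies={'adc': '1'}).text
    block = BeautifulSoup(htmlcode, 'lxml').find(attrs={'class': 'detail_data'})
    # flatten the table markup so the string-based th/td parsers above can run on it
    return str(block).replace('\n ', '').replace('  ', '')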

TestPathNFO.txt Normal file

@@ -0,0 +1,41 @@
/Volumes/Adult/Files/ノ瀬アメリ/Tokyo Hot N0646.avi
/Volumes/Adult/Files/ノ瀬アメリ/MKBD_S03-MaRieS.mp4
/Volumes/192.168.2.100/Adult/Files/Aki Sasaki Megapack/HODV-21299.mkv
/Volumes/Adult/Files/[Tokyo-Hot] [n1180] 美人秘書3穴串刺奉仕残業 (中井綾香 Ayaka Nakai)/(Tokyo-Hot)(n1180)美人秘書3穴串刺奉仕残業 中井綾香.mp4
/mcdv47.avi
/mcdv-47.avi
/mcdv-047.mp4
/mcdv047.mp4
/mcdv0047.mp4
/1pondo-070409_621.mp4
/Volumes/Adult/Files/Kirara Asuka (@明日花キララ) FHD Pack Vol#1(181222)@RUNBKK/No-Watermarked/HOBD00015.FHD2.wmv
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 1/720p/RBD-406_1.mp4
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 1/720p/MDYD-664B.mp4
/Volumes/Adult/Files/107NTTR-037A.mp4
/Volumes/Adult/Files/Yua.Mikami-PML/SNIS-986 国民的アイドル アドレナリン大爆発禁欲1ヶ月後の性欲剥き出し焦らされトランスFUCK 三上悠亜【桃花族】.mp4
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 2/FHD/UPSM-109_2.mkv
/Volumes/Adult/Files/Kirara Asuka (@明日花キララ) FHD Pack Vol#2(181231)@RUNBKK/No-Watermarked/PPT003.SD3.wmv
/Volumes/Adult/Files/波多野结衣/THE波多野結衣 ぶっかけ50連発 CD1.wmv
/Volumes/Adult/Files/波多野结衣/欲しがり 後編 波多野結衣.wmv
/Volumes/Adult/Files/波多野结衣/欲しがり 前編 波多野結衣.wmv
/Volumes/Adult/Files/波多野结衣/加勒比 062212-055 夫の目の前で妻が ~元上司に縛られて~波多野結衣~.rmvb
/Volumes/Adult/Files/波多野结衣/022213-271-carib-whole_s.mp4
/Volumes/Adult/Files/SKYHD-001~010/SKYHD-009_H265.mkv
/Volumes/Adult/Files/大桥步兵合集/LAFBD-41.LaForet.Girl.41.angel.and.devil.Miku.Ohashi.2015.Bluray.1080p.x264.ac3-MTeam.mkv
/Volumes/Adult/Files/大桥步兵合集/032015_161-caribpr-high.mp4
/Volumes/Adult/Files/桃谷绘里香(桃谷エリカ) 所有作品集合/118ppt00016hhb2.mkv
/Volumes/Adult/Files/tia/soe935C.HD.wmv
/Volumes/Adult/Files/SKYHD-011~020/SKYHD-020_H265.mkv
/Volumes/Adult/Files/RION(りおん).Utsunomiya.Shion.宇都宮しをん(うつのみやしをん)/VR/sivr00008_E.mp4
/Volumes/Adult/Files/RION(りおん).Utsunomiya.Shion.宇都宮しをん(うつのみやしをん)/DMM.Video/onsd00899hhb3.mp4
/Volumes/Adult/Files/Rating Top 30 JAV pack/SHKD-744 営業課長の湿ったパンスト 里美ゆりあ.mp4
/Volumes/Adult/Files/Rating Top 30 JAV pack/ABP-627 裏・鈴村あいり-鈴村あいりのオトナの激情SEX4本番 鈴村あいり.MP4
/Volumes/Adult/Files/Rating Top 30 JAV pack/20 ABP-408 上原瑞穂/上原瑞穂 ABP-408 无码流出片段/[ThZu.Cc]20150909164411.m2ts
/Volumes/Adult/Files/Caribbean-101717-520-HD/100917-515/100917-515-carib-1080p.mp4
/Volumes/Adult/Files/ノ瀬アメリ/20081105栗栖エリカ - Sky Angel Blue 10 天舞超絕美少女天使降臨(skyhd010)(中文字幕).avi
/Volumes/Adult/Files/ノ瀬アメリ/一ノ瀬アメリ~加勒比 VERY SEXY.wmv
/Volumes/Adult/Files/ノ瀬アメリ/20101202一瀬アメリ - 東京ブルドック05(inu006).avi
/Volumes/Adult/Files/ノ瀬アメリ/Sky Angel Vol 80 - CD2.mp4
/Volumes/Adult/Files/Mika Sumire すみれ美香/Caribbean-091818-755.mp4
/Volumes/Adult/Files/Takizawa Rola/[HD]abp-031C.wmv
/Volumes/Adult/Files/Takizawa Rola/ABP-013HDA.wmv
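
These paths double as regression inputs for the ID-and-episode extraction regex this commit reworks. A hedged sketch of the two shapes the list exercises; guess_number, DATE_CODE, and ALNUM_CODE are illustrative names and the pattern is a sketch only, not the project's actual extraction rule:

import os
import re

# two shapes the list above exercises: letter code + digits ("mcdv-047",
# "HODV-21299") and six-digit date + serial ("070409_621", "022213-271")
DATE_CODE = re.compile(r'\d{6}[-_]\d{2,4}')
ALNUM_CODE = re.compile(r'[A-Za-z]{2,6}-?\d{2,5}')

def guess_number(path):
    name = os.path.basename(path)
    m = DATE_CODE.search(name) or ALNUM_CODE.search(name)
    return m.group(0) if m else ''

# guess_number('/mcdv-047.mp4')          -> 'mcdv-047'
# guess_number('/1pondo-070409_621.mp4') -> '070409_621'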

TestPathSpecial.txt Normal file

@@ -0,0 +1,51 @@
/Volumes/192.168.2.100/Adult/Files/Aki Sasaki Megapack/HODV-21222.mkv
/Volumes/Adult/Files/ノ瀬アメリ/Tokyo Hot N0646.avi
/Volumes/Adult/Files/ノ瀬アメリ/MKBD_S03-MaRieS.mp4
/Volumes/192.168.2.100/Adult/Files/RION(りおん).Utsunomiya.Shion.宇都宮しをん(うつのみやしをん)/DMM.Video/onsd00899hhb3.mp4
/Volumes/192.168.2.100/Adult/Files/Rating Top 30 JAV pack/IPTD-999-1 彼女の姉貴とイケナイ関係 Rio.wmv
/Volumes/192.168.2.100/Adult/Files/Rating Top 30 JAV pack/IPTD-999A 彼女の姉貴とイケナイ関係 Rio.wmv
/Volumes/192.168.2.100/Adult/Files/Rating Top 30 JAV pack/IPTD-999-A 彼女の姉貴とイケナイ関係 Rio.wmv
/Volumes/192.168.2.100/Adult/Files/Rating Top 30 JAV pack/IPTD-999-C 彼女の姉貴とイケナイ関係 Rio.wmv
/Volumes/192.168.2.100/Adult/Files/Rating Top 30 JAV pack/IPTD-999-B 彼女の姉貴とイケナイ関係 Rio.wmv
/Volumes/192.168.2.100/Adult/Files/tia/soe935C.HD.wmv
/Volumes/192.168.2.100/Adult/Files/tia/soe935B.HD.wmv
/Volumes/192.168.2.100/Adult/Files/tia/soe935A.HD.wmv
/Volumes/192.168.2.100/Adult/Files/tia/soe935D.HD.wmv
/Volumes/Adult/Files/大桥步兵合集/LAFBD-41.LaForet.Girl.41.angel.and.devil.Miku.Ohashi.2015.Bluray.1080p.x264.ac3-MTeam.mkv
/Volumes/Adult/Files/[Tokyo-Hot] [n1180] 美人秘書3穴串刺奉仕残業 (中井綾香 Ayaka Nakai)/(Tokyo-Hot)(n1180)美人秘書3穴串刺奉仕残業 中井綾香.mp4
/mcdv47.avi
/mcdv-47.avi
/mcdv-047.mp4
/mcdv047.mp4
/mcdv0047.mp4
/1pondo-070409_621.mp4
/Volumes/Adult/Files/Kirara Asuka (@明日花キララ) FHD Pack Vol#1(181222)@RUNBKK/No-Watermarked/HOBD00015.FHD2.wmv
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 1/720p/RBD-406_1.mp4
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 1/720p/MDYD-664B.mp4
/Volumes/Adult/Files/107NTTR-037A.mp4
/Volumes/Adult/Files/Yua.Mikami-PML/SNIS-986 国民的アイドル アドレナリン大爆発禁欲1ヶ月後の性欲剥き出し焦らされトランスFUCK 三上悠亜【桃花族】.mp4
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 2/FHD/UPSM-109_2.mkv
/Volumes/Adult/Files/Kirara Asuka (@明日花キララ) FHD Pack Vol#2(181231)@RUNBKK/No-Watermarked/PPT003.SD3.wmv
/Volumes/Adult/Files/波多野结衣/THE波多野結衣 ぶっかけ50連発 CD1.wmv
/Volumes/Adult/Files/波多野结衣/欲しがり 後編 波多野結衣.wmv
/Volumes/Adult/Files/波多野结衣/欲しがり 前編 波多野結衣.wmv
/Volumes/Adult/Files/波多野结衣/加勒比 062212-055 夫の目の前で妻が ~元上司に縛られて~波多野結衣~.rmvb
/Volumes/Adult/Files/波多野结衣/022213-271-carib-whole_s.mp4
/Volumes/Adult/Files/SKYHD-001~010/SKYHD-009_H265.mkv
/Volumes/Adult/Files/大桥步兵合集/LAFBD-41.LaForet.Girl.41.angel.and.devil.Miku.Ohashi.2015.Bluray.1080p.x264.ac3-MTeam.mkv
/Volumes/Adult/Files/大桥步兵合集/032015_161-caribpr-high.mp4
/Volumes/Adult/Files/桃谷绘里香(桃谷エリカ) 所有作品集合/118ppt00016hhb2.mkv
/Volumes/Adult/Files/SKYHD-011~020/SKYHD-020_H265.mkv
/Volumes/Adult/Files/RION(りおん).Utsunomiya.Shion.宇都宮しをん(うつのみやしをん)/VR/sivr00008_E.mp4
/Volumes/Adult/Files/RION(りおん).Utsunomiya.Shion.宇都宮しをん(うつのみやしをん)/DMM.Video/onsd00899hhb3.mp4
/Volumes/Adult/Files/Rating Top 30 JAV pack/SHKD-744 営業課長の湿ったパンスト 里美ゆりあ.mp4
/Volumes/Adult/Files/Rating Top 30 JAV pack/ABP-627 裏・鈴村あいり-鈴村あいりのオトナの激情SEX4本番 鈴村あいり.MP4
/Volumes/Adult/Files/Rating Top 30 JAV pack/20 ABP-408 上原瑞穂/上原瑞穂 ABP-408 无码流出片段/[ThZu.Cc]20150909164411.m2ts
/Volumes/Adult/Files/Caribbean-101717-520-HD/100917-515/100917-515-carib-1080p.mp4
/Volumes/Adult/Files/ノ瀬アメリ/20081105栗栖エリカ - Sky Angel Blue 10 天舞超絕美少女天使降臨(skyhd010)(中文字幕).avi
/Volumes/Adult/Files/ノ瀬アメリ/一ノ瀬アメリ~加勒比 VERY SEXY.wmv
/Volumes/Adult/Files/ノ瀬アメリ/20101202一瀬アメリ - 東京ブルドック05(inu006).avi
/Volumes/Adult/Files/ノ瀬アメリ/Sky Angel Vol 80 - CD2.mp4
/Volumes/Adult/Files/Mika Sumire すみれ美香/Caribbean-091818-755.mp4
/Volumes/Adult/Files/Takizawa Rola/[HD]abp-031C.wmv
/Volumes/Adult/Files/Takizawa Rola/ABP-013HDA.wmv

TestPaths.txt Normal file

@@ -0,0 +1,50 @@
/Volumes/Adult/Files/Kirara Asuka (@明日花キララ) FHD Pack Vol#1(181222)@RUNBKK/No-Watermarked/HOBD00015.FHD2.wmv
/1pondo-070409_621.mp4
/Volumes/Adult/Files/107NTTR-037.mp4
/Volumes/Adult/Files/107NTTR-037A.mp4
/Volumes/Adult/Files/Yua.Mikami-PML/TEK-097 ふたりは無敵.wmv
/Volumes/Adult/Files/Yua.Mikami-PML/SNIS-986 国民的アイドル アドレナリン大爆発禁欲1ヶ月後の性欲剥き出し焦らされトランスFUCK 三上悠亜【桃花族】.mp4
/Volumes/Adult/Files/Yua.Mikami-PML/SSNI-030 三上悠亜ファン感謝祭 国民的アイドル×一般ユーザー20人ガチファンとSEX解禁ハメまくりスペシャル【桃花族】.mp4
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 2/FHD/MIDD-893A.mkv
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 2/FHD/UPSM-109_2.mkv
/Volumes/Adult/Files/Kirara Asuka (@明日花キララ) FHD Pack Vol#2(181231)@RUNBKK/No-Watermarked/PPT003.SD3.wmv
/Volumes/Adult/Files/波多野结衣/THE波多野結衣 ぶっかけ50連発 CD1.wmv
/Volumes/Adult/Files/波多野结衣/欲しがり 後編 波多野結衣.wmv
/Volumes/Adult/Files/波多野结衣/欲しがり 前編 波多野結衣.wmv
/Volumes/Adult/Files/波多野结衣/加勒比 062212-055 夫の目の前で妻が ~元上司に縛られて~波多野結衣~.rmvb
/Volumes/Adult/Files/波多野结衣/022213-271-carib-whole_s.mp4
/Volumes/Adult/Files/桜木凛 Rin Sakuragi FHD Collection Pack Vol/BBI-183.wmv
/Volumes/Adult/Files/NOP-019 芭蕾教室 水嶋あずみ/NOP019B.HD.wmv
/Volumes/Adult/Files/一ノ瀬アメリ part2/栗栖エリカ/20081105栗栖エリカ - Sky Angel Blue 10 天舞超絕美少女天使降臨(skyhd010)(中文字幕).avi
/Volumes/Adult/Files/一ノ瀬アメリ part2/Max Girls/Max Girls 24(xv804)伊東遥,Rio,小沢アリス,葉月しおり,一ノ瀬アメリ,ひなた結衣,藤崎りお.avi
/Volumes/Adult/Files/一ノ瀬アメリ part2/瀬アメリAmeri Ichinose/20091127一瀬アメリ - 一見面就做愛(xv801).avi
/Volumes/Adult/Files/Aki Sasaki Megapack/MSTG-003.mkv
/Volumes/Adult/Files/SKYHD-001~010/SKYHD-009_H265.mkv
/Volumes/Adult/Files/大桥步兵合集/LAFBD-41.LaForet.Girl.41.angel.and.devil.Miku.Ohashi.2015.Bluray.1080p.x264.ac3-MTeam.mkv
/Volumes/Adult/Files/大桥步兵合集/032015_161-caribpr-high.mp4
/Volumes/Adult/Files/桃谷绘里香(桃谷エリカ) 所有作品集合/(PRESTIGE)(ABP-171)彼女のお姉さんは、誘惑ヤリたがり娘。桃谷エリカ.wmv
/Volumes/Adult/Files/桃谷绘里香(桃谷エリカ) 所有作品集合/(PRESTIGE)(ABP-145)濃密な接吻と欲情ベロキス性交 04 桃谷エリカ.wmv
/Volumes/Adult/Files/桃谷绘里香(桃谷エリカ) 所有作品集合/118ppt00016hhb2.mkv
/Volumes/Adult/Files/tia/soe935C.HD.wmv
/Volumes/Adult/Files/SKYHD-011~020/SKYHD-020_H265.mkv
/Volumes/Adult/Files/sakumomo1203-PML/IDBD-795 ももに夢中 2018年日本人にもっとも愛された女優桜空ももPREMIUM BOX8時間BEST.mp4
/Volumes/Adult/Files/sakumomo1203-PML/IDBD-768 Gカップグラビアアイドル桜空もも初ベスト 原石 2【桃花族】.mp4
/Volumes/Adult/Files/RION(りおん).Utsunomiya.Shion.宇都宮しをん(うつのみやしをん)/VR/sivr00008_E.mp4
/Volumes/Adult/Files/RION(りおん).Utsunomiya.Shion.宇都宮しをん(うつのみやしをん)/DMM.Video/onsd00899hhb3.mp4
/Volumes/Adult/Files/Rating Top 30 JAV pack/SHKD-744 営業課長の湿ったパンスト 里美ゆりあ.mp4
/Volumes/Adult/Files/Rating Top 30 JAV pack/ABP-627 裏・鈴村あいり-鈴村あいりのオトナの激情SEX4本番 鈴村あいり.MP4
/Volumes/Adult/Files/Rating Top 30 JAV pack/20 ABP-408 上原瑞穂/上原瑞穂 ABP-408 无码流出片段/[ThZu.Cc]20150909164411.m2ts
/Volumes/Adult/Files/Caribbean-101717-520-HD/100917-515/100917-515-carib-1080p.mp4
/Volumes/Adult/Files/Kirara Asuka (@明日花キララ) FHD Pack Vol#3(190119)@RUNBKK/No-Watermarked/SOE976.FHD3.wmv
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 1/720p/RBD-406_1.mp4
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 1/720p/MDYD-664B.mp4
/Volumes/Adult/Files/ノ瀬アメリ/20081105栗栖エリカ - Sky Angel Blue 10 天舞超絕美少女天使降臨(skyhd010)(中文字幕).avi
/Volumes/Adult/Files/ノ瀬アメリ/一ノ瀬アメリ~加勒比 VERY SEXY.wmv
/Volumes/Adult/Files/ノ瀬アメリ/20101202一瀬アメリ - 東京ブルドック05(inu006).avi
/Volumes/Adult/Files/ノ瀬アメリ/Sky Angel Vol 80 - CD2.mp4
/Volumes/Adult/Files/ノ瀬アメリ/20100226一瀬アメリ - OL Style 制服(xv827).avi
/Volumes/Adult/Files/Mika Sumire すみれ美香/Caribbean-091818-755.mp4
/Volumes/Adult/Files/[Tokyo-Hot] [n1180] 美人秘書3穴串刺奉仕残業 (中井綾香 Ayaka Nakai)/(Tokyo-Hot)(n1180)美人秘書3穴串刺奉仕残業 中井綾香.mp4
/Volumes/Adult/Files/Takizawa Rola/[HD]abp-031C.wmv
/Volumes/Adult/Files/Takizawa Rola/ABP-013HDA.wmv
/Volumes/Adult/Files/Uncensored Mosaic Removal Megapack/ADN-017(Asami Ogawa).mp4

config.ini Normal file → Executable file

@@ -1,28 +1,35 @@
 [common]
-main_mode=1
+main_mode=2
+# All paths are absolute; do not include characters such as " or '
+search_folder= /Volumes/192.168.2.100/Adult/AVTest
-failed_output_folder=failed
-success_output_folder=JAV_output
+# If failed_output_folder is empty, videos whose info cannot be scraped will not be moved
+failed_output_folder= /Volumes/192.168.2.100/Adult/UnknownStars
+success_output_folder= /Volumes/192.168.2.100/Adult/Files
+# Temporary storage path for resources such as xxx.nfo and poster images
+temp_folder= /Volumes/192.168.2.100/Adult/temp
+# For remotely mounted drives it is best not to enable soft links: a soft link stores an absolute path, and the path on the remote NAS usually differs from the locally mounted one.
 soft_link=0
 [proxy]
-proxy=127.0.0.1:1080
+# Example SOCKS proxy configuration; the value after = may be left empty
+proxy= socks5h://127.0.0.1:1081
-timeout=10
+timeout= 10
-retry=3
+retry= 5
 [Name_Rule]
-location_rule=actor+'/'+number
+location_rule= actor+'/'+number
-naming_rule=number+'-'+title
+naming_rule= number+'-'+title
 [update]
 update_check=1
 [media]
-media_warehouse=emby
 #emby or plex or kodi ,emby=jellyfin
+media_warehouse=EMBY
 [escape]
 literals=\()
-folders=failed,JAV_output
+folders=/Volumes/Adult/UnknownStars,/Volumes/Adult/Stars
 [debug_mode]
 switch=1
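
For reference, the options above can be read with the standard library's configparser. This is only a minimal sketch (the project routes config access through its own ConfigApp wrapper, and the variable names below are illustrative); note that getint and getboolean tolerate the leading spaces after '=':

from configparser import ConfigParser

config = ConfigParser()
config.read('config.ini', encoding='utf-8')

main_mode = config.getint('common', 'main_mode')          # 2
search_folder = config.get('common', 'search_folder')     # leading space after '=' is stripped
soft_link = config.getboolean('common', 'soft_link')      # '0' -> False
proxy_url = config.get('proxy', 'proxy')                  # may be left empty to disable the proxy
timeout = config.getint('proxy', 'timeout')               # 10
retry_count = config.getint('proxy', 'retry')             # 5
media_warehouse = config.get('media', 'media_warehouse')  # 'EMBY'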

core.py
File diff suppressed because it is too large.

jav321.py (deleted)

@@ -1,73 +0,0 @@
import json
from bs4 import BeautifulSoup
from lxml import html
from ADC_function import post_html
def main(number: str) -> str:
result = post_html(url="https://www.jav321.com/search", query={"sn": number})
soup = BeautifulSoup(result.text, "html.parser")
lx = html.fromstring(str(soup))
if "/video/" in result.url:
data = parse_info(soup=soup)
dic = {
"title": get_title(lx=lx),
"studio": "",
"year": data["release"][:4],
"outline": get_outline(lx=lx),
"director": "",
"cover": get_cover(lx=lx),
"imagecut": 1,
"actor_photo": "",
"website": result.url,
"source": "jav321.py",
**data,
}
else:
dic = {}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
def get_title(lx: html.HtmlElement) -> str:
return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0].strip()
def parse_info(soup: BeautifulSoup) -> dict:
data = str(soup.select_one("div.row > div.col-md-9")).split("<br/>")
return {
"actor": get_anchor_info(h=data[0]),
"label": get_anchor_info(h=data[1]),
"tag": get_anchor_info(h=data[2]),
"number": get_text_info(h=data[3]),
"release": get_text_info(h=data[4]),
"runtime": get_text_info(h=data[5]),
}
def get_anchor_info(h: str) -> str:
result = []
data = BeautifulSoup(h, "html.parser").find_all("a", href=True)
for d in data:
result.append(d.text)
return ",".join(result)
def get_text_info(h: str) -> str:
return h.split(": ")[1]
def get_cover(lx: html.HtmlElement) -> str:
return lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")[0]
def get_outline(lx: html.HtmlElement) -> str:
return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0]
if __name__ == "__main__":
print(main("wmc-002"))

BIN readme/._readme1.PNG Executable file (binary file not shown)

BIN readme/._readme2.PNG Executable file (binary file not shown)

BIN readme/._readme4.PNG Executable file (binary file not shown)

readme/This is readms.md's images folder Normal file → Executable file

readme/flow_chart2.png Normal file → Executable file (image unchanged, 101 KiB)

readme/readme1.PNG Normal file → Executable file (image unchanged, 1.1 KiB)

readme/readme2.PNG Normal file → Executable file (image unchanged, 3.4 KiB)

readme/readme3.PNG Normal file → Executable file (image unchanged, 1.3 KiB)

readme/readme4.PNG Normal file → Executable file (image unchanged, 16 KiB)

readme/single.gif Normal file → Executable file (image unchanged, 68 KiB)

(file name not shown)

@@ -0,0 +1 @@
1

BIN resource/flow_chart2.png Executable file (binary file not shown, 101 KiB)

BIN resource/readme1.PNG Executable file (binary file not shown, 1.1 KiB)

BIN resource/readme2.PNG Executable file (binary file not shown, 3.4 KiB)

BIN resource/readme3.PNG Executable file (binary file not shown, 1.3 KiB)

BIN resource/readme4.PNG Executable file (binary file not shown, 16 KiB)

resource/ruquirments.txt Executable file

@@ -0,0 +1 @@
pipenv install lxml bs4 pillow pyquery

BIN resource/single.gif Executable file (binary file not shown, 68 KiB)

(file name not shown, deleted)

@@ -1,4 +0,0 @@
lxml
bs4
pillow
pyquery

test.py Executable file

@@ -0,0 +1,80 @@
import os
import re
from itertools import groupby
import fuckit
import pandas as pd
from tenacity import retry, stop_after_delay, wait_fixed
def go():
a = [1, 2, 3, 4, 5, 6]
# [print(x) for x in a]
# [print(x) for x in a]
a1 = groupby(a, key=lambda k: (k / 2))
for i in a1:
print(i)
for i in a1:
print(i)
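

# Note on go() above: itertools.groupby is single-pass, which is why the second
# "for i in a1" loop prints nothing, and it only groups *consecutive* elements,
# so unsorted input must first be sorted on the same key:
def go_sorted():
    data = sorted([3, 1, 4, 2, 6, 5], key=lambda k: k // 2)
    for key, grp in groupby(data, key=lambda k: k // 2):
        print(key, list(grp))  # each key appears once, with its grouped items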
class TryDo:
def __init__(self, func, times=3):
self.tries = times
self.func = func
def __iter__(self):
self.currentTry = 1
return self
    def __next__(self):
        if self.currentTry > self.tries:
            raise StopIteration(False)  # all attempts exhausted
        self.currentTry += 1
        try:
            self.func()
        except Exception:
            return self.currentTry - 1  # this attempt failed; iterate again
        raise StopIteration(True)  # the call succeeded, stop retrying
# def do(self):
@retry(stop=stop_after_delay(10), wait=wait_fixed(2))
def stop_after_10_s():
print("Stopping after 10 seconds")
raise Exception
# f = iter( TryDo(do_something, 5))
# stop_after_10_s()
def errorfunc():
raise Exception
def okfunc():
print("ok")
# with fuckit:
# errorfunc()
# okfunc()
# re.match()
r = re.search(r'(?<=999)-?((?P<alpha>([A-Z](?![A-Z])))|(?P<num>\d(?!\d)))', "IPTD-999-B-彼女の姉貴とイケナイ関係-RIO", re.I)
#
print(r.groupdict())
print(r.groupdict()['alpha'])
print(r.group(2))
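
# How the pattern above works: the lookbehind (?<=999) pins the match to the
# position right after the ID digits; an optional "-" plus a single trailing
# letter (group 'alpha') or a single digit (group 'num') is then read as the
# episode mark, e.g. "IPTD-999-B" -> alpha='B', "IPTD-999-2" -> num='2'.
# The negative lookaheads reject multi-character tails such as "-HD", but a
# lone "-C" still matches as episode C, so a Chinese-subtitle "-C" suffix
# cannot be told apart from disc C by this pattern alone.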
line = "Cats are smarter than dogs"
matchObj = re.search(r'(?<=a)(.*) are (.*?) .*', line, re.M | re.I)
if matchObj:
print("matchObj.group() : ", matchObj.group())
print("matchObj.group(1) : ", matchObj.group(1))
print("matchObj.group(2) : ", matchObj.group(2))
else:
print("No match!!")
# print(r[-1])
# print(newList)
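
# pandas is imported above but not used yet; unlike itertools.groupby, its
# groupby needs no pre-sorting and collects non-adjacent keys. A minimal sketch:
df = pd.DataFrame({'id': ['A', 'B', 'A'], 'n': [1, 2, 3]})
for key, sub in df.groupby('id'):
    print(key, sub['n'].tolist())    # A [1, 3]   then   B [2]
print(df.groupby('id')['n'].sum())   # per-id totals: A -> 4, B -> 2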

update_check.json Normal file → Executable file