Compare commits

1 commit

56 changed files with 1585 additions and 3371 deletions

.gitattributes vendored (1 line changed)

@@ -1 +0,0 @@
*.py text=auto eol=lf

.gitignore vendored (1 line changed)

@@ -1,4 +1,3 @@
-*.DS_Store
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

.idea/.gitignore generated vendored (2 lines changed)

@@ -1,2 +0,0 @@
# Default ignored files
/workspace.xml

.idea/AV_Data_Capture.iml

@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.8 (AV_Data_Capture)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

.idea/dictionaries/tanpengsccd.xml

@@ -1,19 +0,0 @@
<component name="ProjectDictionaryState">
  <dictionary name="tanpengsccd">
    <words>
      <w>avsox</w>
      <w>emby</w>
      <w>fanart</w>
      <w>fanza</w>
      <w>javbus</w>
      <w>javdb</w>
      <w>jellyfin</w>
      <w>khtml</w>
      <w>kodi</w>
      <w>mgstage</w>
      <w>plex</w>
      <w>pondo</w>
      <w>rmvb</w>
    </words>
  </dictionary>
</component>

.idea/inspectionProfiles/profiles_settings.xml

@@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>

.idea/misc.xml generated (7 lines changed)

@@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="JavaScriptSettings">
    <option name="languageLevel" value="ES6" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (AV_Data_Capture)" project-jdk-type="Python SDK" />
</project>

.idea/modules.xml generated (8 lines changed)

@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/AV_Data_Capture.iml" filepath="$PROJECT_DIR$/.idea/AV_Data_Capture.iml" />
    </modules>
  </component>
</project>

.idea/other.xml generated (6 lines changed)

@@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="PySciProjectComponent">
    <option name="PY_SCI_VIEW_SUGGESTED" value="true" />
  </component>
</project>

.idea/vcs.xml generated (6 lines changed)

@@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>

ADC_function.py

@@ -1,127 +1,97 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import requests
 from configparser import ConfigParser
 import os
 import re
 import time
 import sys
-from lxml import etree
-import sys
-import io
-from ConfigApp import ConfigApp
-
-# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
-# sys.setdefaultencoding('utf-8')
-
-# config_file='config.ini'
-# config = ConfigParser()
-
-# if os.path.exists(config_file):
-#     try:
-#         config.read(config_file, encoding='UTF-8')
-#     except:
-#         print('[-]Config.ini read failed! Please use the offical file!')
-# else:
-#     print('[+]config.ini: not found, creating...',end='')
-#     with open("config.ini", "wt", encoding='UTF-8') as code:
-#         print("[common]", file=code)
-#         print("main_mode = 1", file=code)
-#         print("failed_output_folder = failed", file=code)
-#         print("success_output_folder = JAV_output", file=code)
-#         print("", file=code)
-#         print("[proxy]",file=code)
-#         print("proxy=127.0.0.1:1081",file=code)
-#         print("timeout=10", file=code)
-#         print("retry=3", file=code)
-#         print("", file=code)
-#         print("[Name_Rule]", file=code)
-#         print("location_rule=actor+'/'+number",file=code)
-#         print("naming_rule=number+'-'+title",file=code)
-#         print("", file=code)
-#         print("[update]",file=code)
-#         print("update_check=1",file=code)
-#         print("", file=code)
-#         print("[media]", file=code)
-#         print("media_warehouse=emby", file=code)
-#         print("#emby plex kodi", file=code)
-#         print("", file=code)
-#         print("[escape]", file=code)
-#         print("literals=\\", file=code)
-#         print("", file=code)
-#         print("[movie_location]", file=code)
-#         print("path=", file=code)
-#         print("", file=code)
-#     print('.',end='')
-#     time.sleep(2)
-#     print('.')
-#     print('[+]config.ini: created!')
-#     print('[+]Please restart the program!')
-#     time.sleep(4)
-#     os._exit(0)
-# try:
-#     config.read(config_file, encoding='UTF-8')
-# except:
-#     print('[-]Config.ini read failed! Please use the offical file!')
-
-config = ConfigApp()
-
-
-def get_network_settings():
-    try:
-        proxy = config.proxy
-        timeout = int(config.timeout)
-        retry_count = int(config.retry)
-        assert timeout > 0
-        assert retry_count > 0
-    except:
-        raise ValueError("[-]Proxy config error! Please check the config.")
-    return proxy, timeout, retry_count
-
-
-def getDataState(json_data):  # 元数据获取失败检测
-    if json_data['title'] == '' or json_data['title'] == 'None' or json_data['title'] == 'null':
-        return 0
-    else:
-        return 1
-
-
-def ReadMediaWarehouse():
-    return config.media_server
-
-
-def UpdateCheckSwitch():
-    check=str(config.update_check)
-    if check == '1':
-        return '1'
-    elif check == '0':
-        return '0'
-    elif check == '':
-        return '0'
-
-
-def getXpathSingle(htmlcode,xpath):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result1 = str(html.xpath(xpath)).strip(" ['']")
-    return result1
-
-
-def get_html(url,cookies = None):  # 网页请求核心
-    proxy, timeout, retry_count = get_network_settings()
-    i = 0
-    print(url)
-    while i < retry_count:
-        try:
-            if not proxy == '':
-                proxies = {"http": proxy, "https": proxy}
-                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36'}
-                getweb = requests.get(str(url), headers=headers, timeout=timeout, proxies=proxies, cookies=cookies)
-                getweb.encoding = 'utf-8'
-                return getweb.text
-            else:
-                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
-                getweb = requests.get(str(url), headers=headers, timeout=timeout, cookies=cookies)
-                getweb.encoding = 'utf-8'
-                return getweb.text
-        except Exception as e:
-            print(e)
-            i += 1
-            print('[-]Connect retry '+str(i)+'/'+str(retry_count))
-    print('[-]Connect Failed! Please check your Proxy or Network!')
+config_file='config.ini'
+config = ConfigParser()
+
+if os.path.exists(config_file):
+    try:
+        config.read(config_file, encoding='UTF-8')
+    except:
+        print('[-]Config.ini read failed! Please use the offical file!')
+else:
+    print('[+]config.ini: not found, creating...')
+    with open("config.ini", "wt", encoding='UTF-8') as code:
+        print("[proxy]",file=code)
+        print("proxy=127.0.0.1:1080",file=code)
+        print("timeout=10", file=code)
+        print("retry=3", file=code)
+        print("", file=code)
+        print("[Name_Rule]", file=code)
+        print("location_rule='JAV_output/'+actor+'/'+number",file=code)
+        print("naming_rule=number+'-'+title",file=code)
+        print("", file=code)
+        print("[update]",file=code)
+        print("update_check=1",file=code)
+        print("", file=code)
+        print("[media]", file=code)
+        print("media_warehouse=emby", file=code)
+        print("#emby or plex", file=code)
+        print("#plex only test!", file=code)
+        print("", file=code)
+        print("[directory_capture]", file=code)
+        print("switch=0", file=code)
+        print("directory=", file=code)
+        print("", file=code)
+        print("everyone switch:1=on, 0=off", file=code)
+    time.sleep(2)
+    print('[+]config.ini: created!')
+    try:
+        config.read(config_file, encoding='UTF-8')
+    except:
+        print('[-]Config.ini read failed! Please use the offical file!')
+
+
+def ReadMediaWarehouse():
+    return config['media']['media_warehouse']
+
+
+def UpdateCheckSwitch():
+    check=str(config['update']['update_check'])
+    if check == '1':
+        return '1'
+    elif check == '0':
+        return '0'
+    elif check == '':
+        return '0'
+
+
+def get_html(url,cookies = None):  # 网页请求核心
+    try:
+        proxy = config['proxy']['proxy']
+        timeout = int(config['proxy']['timeout'])
+        retry_count = int(config['proxy']['retry'])
+    except:
+        print('[-]Proxy config error! Please check the config.')
+    i = 0
+    while i < retry_count:
+        try:
+            if not str(config['proxy']['proxy']) == '':
+                proxies = {"http": "http://" + proxy,"https": "https://" + proxy}
+                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36'}
+                getweb = requests.get(str(url), headers=headers, timeout=timeout,proxies=proxies, cookies=cookies)
+                getweb.encoding = 'utf-8'
+                return getweb.text
+            else:
+                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
+                getweb = requests.get(str(url), headers=headers, timeout=timeout, cookies=cookies)
+                getweb.encoding = 'utf-8'
+                return getweb.text
+        except requests.exceptions.RequestException:
+            i += 1
+            print('[-]Connect retry '+str(i)+'/'+str(retry_count))
+        except requests.exceptions.ConnectionError:
+            i += 1
+            print('[-]Connect retry '+str(i)+'/'+str(retry_count))
+        except requests.exceptions.ProxyError:
+            i += 1
+            print('[-]Connect retry '+str(i)+'/'+str(retry_count))
+        except requests.exceptions.ConnectTimeout:
+            i += 1
+            print('[-]Connect retry '+str(i)+'/'+str(retry_count))
+    print('[-]Connect Failed! Please check your Proxy or Network!')
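
Editorial note: both sides of this diff implement the same retry-with-proxy pattern; a minimal standalone sketch follows (function name, placeholder User-Agent, and defaults are illustrative, not the project's API). In the new code, `requests.exceptions.RequestException` is the base class of `ConnectionError`, `ProxyError`, and `ConnectTimeout`, so the three narrower `except` clauses after it never run; catching the base class alone, as below, behaves identically.

```python
import requests

def fetch(url, proxy='', timeout=10, retry_count=3):
    """Fetch a page, optionally through an HTTP proxy, retrying on any network error."""
    # requests expects scheme-prefixed proxy addresses, e.g. 'http://127.0.0.1:1080'
    proxies = {"http": "http://" + proxy, "https": "https://" + proxy} if proxy else None
    headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder UA string
    for attempt in range(1, retry_count + 1):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout, proxies=proxies)
            resp.encoding = 'utf-8'
            return resp.text
        except requests.exceptions.RequestException:
            # Base class: covers connection errors, proxy errors and timeouts alike.
            print('[-]Connect retry ' + str(attempt) + '/' + str(retry_count))
    print('[-]Connect Failed! Please check your Proxy or Network!')
```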

AV_Data_Capture.py

@@ -1,416 +1,153 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import glob
 import os
 import time
-import fuckit
-from tenacity import retry, stop_after_delay, wait_fixed
-import json
-import shutil
-import itertools
-import argparse
-from pathlib import Path
-from core import *
-from ConfigApp import ConfigApp
-from PathNameProcessor import PathNameProcessor
-
-# TODO 封装聚合解耦CORE
-# TODO (学习)统一依赖管理工具
-# TODO 不同媒体服务器尽量兼容统一一种元数据 如nfo 海报等embyjellyfinplex
-# TODO 字幕整理功能 文件夹中读取所有字幕 并提番号放入对应缓存文件夹中TEMP
-
-config = ConfigApp()
-
-
-def safe_list_get(list_in, idx, default=None):
-    """
-    数组安全取值
-    :param list_in:
-    :param idx:
-    :param default:
-    :return:
-    """
-    try:
-        return list_in[idx]
-    except IndexError:
-        return default
-
-
-def UpdateCheck(version):
-    if UpdateCheckSwitch() == '1':
-        html2 = get_html('https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/update_check.json')
-        html = json.loads(str(html2))
-
-        if not version == html['version']:
-            print('[*] * New update ' + html['version'] + ' *')
-            print('[*] ↓ Download ↓')
-            print('[*] ' + html['download'])
-            print('[*]======================================================')
-    else:
-        print('[+]Update Check disabled!')
-
-
-def argparse_get_file():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("file", default='', nargs='?', help="Write the file path on here")
-    args = parser.parse_args()
-    if args.file == '':
-        return ''
-    else:
-        return args.file
-
-
-def movie_lists(escape_folders):
-    escape_folders = re.split('[,]', escape_folders)
-    total = []
-
-    for root, dirs, files in os.walk(config.search_folder):
-        if root in escape_folders:
-            continue
-        for file in files:
-            if re.search(PathNameProcessor.pattern_of_file_name_suffixes, file, re.IGNORECASE):
-                path = os.path.join(root, file)
-                total.append(path)
-    return total
-
-
-# def CEF(path):
-#     try:
-#         files = os.listdir(path)  # 获取路径下的子文件(夹)列表
-#         for file in files:
-#             os.removedirs(path + '/' + file)  # 删除这个空文件夹
-#             print('[+]Deleting empty folder', path + '/' + file)
-#     except:
-#         a = ''
-
-
-def get_numbers(paths):
-    """提取对应路径的番号+集数"""
-
-    def get_number(filepath, absolute_path=False):
-        """
-        获取番号集数
-        :param filepath:
-        :param absolute_path:
-        :return:
-        """
-        name = filepath.upper()  # 转大写
-        if absolute_path:
-            name = name.replace('\\', '/')
-        # 移除干扰字段
-        name = PathNameProcessor.remove_distractions(name)
-        # 抽取 文件路径中可能存在的尾部集数,和抽取尾部集数的后的文件路径
-        suffix_episode, name = PathNameProcessor.extract_suffix_episode(name)
-        # 抽取 文件路径中可能存在的 番号后跟随的集数 和 处理后番号
-        episode_behind_code, code_number = PathNameProcessor.extract_code(name)
-        # 无番号 则设置空字符
-        code_number = code_number if code_number else ''
-        # 优先取尾部集数,无则取番号后的集数(几率低),都无则为空字符
-        episode = suffix_episode if suffix_episode else episode_behind_code if episode_behind_code else ''
-
-        return code_number, episode
-
-    maps = {}
-    for path in paths:
-        number, episode = get_number(path)
-        maps[path] = (number, episode)
-
-    return maps
-
-
-def create_folder(paths):
-    for path_to_make in paths:
-        if path_to_make:
-            try:
-                os.makedirs(path_to_make)
-            except FileExistsError as e:
-                # name = f'{folder=}'.split('=')[0].split('.')[-1]
-                print(path_to_make + " 已经存在")
-                pass
-            except Exception as exception:
-                print('! 创建文件夹 ' + path_to_make + ' 失败,文件夹路径错误或权限不够')
-                raise exception
-        else:
-            raise Exception('!创建的文件夹路径为空,请确认')
-
-
-if __name__ == '__main__':
-    version = '2.8.2'
-
-    print('[*]================== AV Data Capture ===================')
-    print('[*]  Version ' + version)
-    print('[*]======================================================')
-
-    # UpdateCheck(version)
-
-    CreatFailedFolder(config.failed_folder)
-    os.chdir(os.getcwd())
-
-    # 创建文件夹
-    create_folder([config.failed_folder, config.search_folder, config.temp_folder])
-    # temp 文件夹中infos放 番号json信息pics中放图片信息
-    path_infos = config.temp_folder + '/infos'
-    path_pics = config.temp_folder + '/pics'
-    create_folder([path_infos, path_pics])
-
-    # 遍历搜索目录下所有视频的路径
-    movie_list = movie_lists(config.escape_folder)
-    # 以下是从文本中提取测试的数据
-    # f = open('TestPathNFO.txt', 'r')
-    # f = open('TestPathSpecial.txt', 'r')
-    # movie_list = [line[:-1] for line in f.readlines()]
-    # f.close()
-
-    # 获取 番号,集数,路径 的字典->list
-    code_ep_paths = [[codeEposode[0], codeEposode[1], path] for path, codeEposode in get_numbers(movie_list).items()]
-    [print(i) for i in code_ep_paths]
-
-    # 按番号分组片子列表(重点),用于寻找相同番号的片子
-    '''
-    这里利用pandas分组 "https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html"
-    '''
-    # # 设置打印时显示所有列
-    # pd.set_option('display.max_columns', None)
-    # # 显示所有行
-    # pd.set_option('display.max_rows', None)
-    # # 设置value的显示长度为100默认为50
-    # pd.set_option('max_colwidth', 30)
-    # # 创建框架
-    # df = pd.DataFrame(code_ep_paths, columns=('code', 'ep', 'path'))
-    # # 以番号分组
-    # groupedCode_code_ep_paths = df.groupby(['code'])
-    # # print(df.groupby(['code', 'ep']).describe().unstack())
-    # grouped_code_ep = df.groupby(['code', 'ep'])['path']
-    #
-    sorted_code_list = sorted(code_ep_paths, key=lambda code_ep_path: code_ep_path[0])
-    group_code_list = itertools.groupby(sorted_code_list, key=lambda code_ep_path: code_ep_path[0])
-
-    def group_code_list_to_dict(group_code_list):
-        data_dict = {}
-        for code, code_ep_path_group in group_code_list:
-            code_ep_path_list = list(code_ep_path_group)
-            eps_of_code = {}
-            group_ep_list = itertools.groupby(code_ep_path_list, key=lambda code_ep_path: code_ep_path[1])
-            for ep, group_ep_group in group_ep_list:
-                group_ep_list = list(group_ep_group)
-                eps_of_code[ep] = [code_ep_path[2] for code_ep_path in group_ep_list]
-            data_dict[code] = eps_of_code
-        return data_dict
-
-    def print_same_code_ep_path(data_dict_in):
-        for code_in in data_dict_in:
-            ep_path_list = data_dict_in[code_in]
-            if len(ep_path_list) > 1:
-                print('--' * 60)
-                print("|" + (code_in if code_in else 'unknown') + ":")
-                # group_ep_list = itertools.groupby(code_ep_path_list.items(), key=lambda code_ep_path: code_ep_path[0])
-                for ep in ep_path_list:
-                    path_list = ep_path_list[ep]
-                    print('--' * 12)
-                    ep = ep if ep else ' '
-                    if len(path_list) == 1:
-                        print('| 集数:' + ep + ' 文件: ' + path_list[0])
-                    else:
-                        print('| 集数:' + ep + ' 文件: ')
-                        for path in path_list:
-                            print('| ' + path)
-            else:
-                pass
-
-    # 分好组的数据 {code:{ep:[path]}}
-    data_dict_groupby_code_ep = group_code_list_to_dict(group_code_list)
-
-    print('--' * 100)
-    print("找到影片数量:" + str(len(movie_list)))
-    print("合计番号数量:" + str(len(data_dict_groupby_code_ep)) + " (多个相同番号的影片只统计一个,不能识别的番号 都统一为'unknown')")
-    print('Warning:!!!! 以下为相同番号的电影明细')
-    print('' + '--' * 80)
-    print_same_code_ep_path(data_dict_groupby_code_ep)
-    print('' + '--' * 80)
-
-    isContinue = input('任意键继续? N 退出 \n')
-    if isContinue.strip(' ') == "N":
-        exit(1)
-
-    # ========== 野鸡番号拖动 ==========
-    # number_argparse = argparse_get_file()
-    # if not number_argparse == '':
-    #     print("[!]Making Data for [" + number_argparse + "], the number is [" + getNumber(number_argparse,
-    #                                                                                      absolute_path=True) + "]")
-    #     nfo = core_main(number_argparse, getNumber(number_argparse, absolute_path=True))
-    #     print("[*]======================================================")
-    #     CEF(config.success_folder)
-    #     CEF(config.failed_folder)
-    #     print("[+]All finished!!!")
-    #     input("[+][+]Press enter key exit, you can check the error messge before you exit.")
-    #     os._exit(0)
-    # ========== 野鸡番号拖动 ==========
-
-    def download_code_infos(code_list, is_read_cache=True):
-        """
-        遍历按番号分组的集合刮取番号信息并缓存
-        :param is_read_cache: 是否读取缓存数据
-        :param code_list:
-        :return: {code:nfo}
-        """
-        count_all_grouped = len(code_list)
-        count = 0
-        code_info_dict = {}
-        for code in code_list:
-            count = count + 1
-            percentage = str(count / int(count_all_grouped) * 100)[:4] + '%'
-            print('[!] - ' + percentage + ' [' + str(count) + '/' + str(count_all_grouped) + '] -')
-            try:
-                print("[!]搜刮数据 [" + code + "]")
-                if code:
-                    # 创建番号的文件夹
-                    file_path = path_infos + '/' + code + '.json'
-                    nfo = {}
-                    # 读取缓存信息,如果没有则联网搜刮
-                    path = Path(file_path)
-                    if is_read_cache and (path.exists() and path.is_file() and path.stat().st_size > 0):
-                        print('找到缓存信息')
-                        with open(file_path) as fp:
-                            nfo = json.load(fp)
-                    else:
-                        # 核心功能 - 联网抓取信息字典
-                        print('联网搜刮')
-                        nfo = core_main(code)
-                        print('正在写入', end='')
-
-                        # 把缓存信息写入缓存文件夹中,有时会设备占用而失败,重试即可
-                        @retry(stop=stop_after_delay(3), wait=wait_fixed(2))
-                        def read_file():
-                            with open(file_path, 'w') as fp:
-                                json.dump(nfo, fp)
-
-                        read_file()
-                        print('完成!')
-                    # 将番号信息放入字典
-                    code_info_dict[code] = nfo
-                    print("[*]======================================================")
-            except Exception as e:  # 番号的信息获取失败
-                code_info_dict[code] = ''
-                print("找不到信息:" + code + ',Reason:' + str(e))
-                # if config.soft_link:
-                #     print('[-]Link', file_path_name, 'to failed folder')
-                #     os.symlink(file_path_name, config.failed_folder + '/')
-                # else:
-                #     try:
-                #         print('[-]Move ' + file_path_name + ' to failed folder:' + config.failed_folder)
-                #         shutil.move(file_path_name, config.failed_folder + '/')
-                #     except FileExistsError:
-                #         print('[!]File exists in failed!')
-                #     except:
-                #         print('[+]skip')
-                continue
-        return code_info_dict
-
-    print('----------------------------------')
-    code_infos = download_code_infos(data_dict_groupby_code_ep)
-    print("----未找到番号数据的番号----")
-    print([print(code) for code in code_infos if code_infos[code] == ''])
-    print("-------------------------")
-
-    def download_images_of_nfos(code_info_dict):
-        """
-        遍历番号信息下载番号电影的海报图片
-        :param code_info_dict:
-        :return: 无图片的信息的番号
-        """
-        code_list_empty_image = []
-        for code in code_info_dict:
-            nfo = code_info_dict[code]
-            if len(nfo.keys()) == 0:
-                code_list_empty_image.append(code)
-                continue
-            code_pics_folder_to_save = path_pics + '/' + code
-            # 1 创建 番号文件夹
-            os.makedirs(code_pics_folder_to_save, exist_ok=True)
-            # 下载缩略图
-            if nfo['imagecut'] == 3:  # 3 是缩略图
-                path = Path(code_pics_folder_to_save + '/' + 'thumb.png')
-                if path.exists() and path.is_file() and path.stat().st_size > 0:
-                    print(code + ':缩略图已有缓存')
-                else:
-                    print(code + ':缩略图下载中...')
-                    download_file(nfo['cover_small'], code_pics_folder_to_save, 'thumb.png')
-                    print(code + ':缩略图下载完成')
-            # 下载海报
-            path = Path(code_pics_folder_to_save + '/' + 'poster.png')
-            if path.exists() and path.is_file() and path.stat().st_size > 0:
-                print(code + ':海报已有缓存')
-            else:
-                print(code + ':海报下载中...')
-                download_file(nfo['cover'], code_pics_folder_to_save, 'poster.png')
-                print(code + ':海报下载完成')
-        return code_list_empty_image
-
-    code_list_empty = download_images_of_nfos(code_infos)
-    print("----未找到集数的番号----")
-    print([print(code) for code in code_list_empty])
-
-    print("------搜刮未找到集数的番号------")
-    code_infos_of_no_ep = download_code_infos(code_list_empty, is_read_cache=False)
-    print("----还是未找到番号数据的番号----")
-    print([print(code) for code in code_infos_of_no_ep if code_infos_of_no_ep[code] == ''])
-    print("----------------------")
-
-    # 开始操作
-    # # 2 创建缩略图海报
-    # if nfo['imagecut'] == 3:  # 3 是缩略图
-    #     download_cover_file(nfo['cover_small'], code, code_pics_folder_to_save)
-    # # 3 创建图
-    # download_image(nfo['cover'], code, code_pics_folder_to_save)
-    # # 4 剪裁
-    # crop_image(nfo['imagecut'], code, code_pics_folder_to_save)
-    # # 5 背景图
-    # copy_images_to_background_image(code, code_pics_folder_to_save)
-
-    # 6 创建 mame.nfo(不需要需要时从infos中josn文件转为nfo文件)
-    # make_nfo_file(nfo, code, temp_path_to_save)
-
-    # 相同番号处理:按集数添加-CD[X];视频格式 and 大小 分;
-    # TODO 方式1 刮削添加nfo封面内容截图等
-    # 6 创建 mame.nfo(不需要需要时从infos中josn文件转为nfo文件)
-    make_nfo_file(nfo, code, temp_path_to_save)
-    # TODO 方式2 整理:按规则移动影片,字幕 到 演员,发行商,有无🐎 等
-    # if config.program_mode == '1':
-    #     if multi_part == 1:
-    #         number += part  # 这时number会被附加上CD1后缀
-    #     smallCoverCheck(path, number, imagecut, json_data['cover_small'], c_word, option, filepath, config.failed_folder)  # 检查小封面
-    #     imageDownload(option, json_data['cover'], number, c_word, path, multi_part, filepath, config.failed_folder)  # creatFoder会返回番号路径
-    #     cutImage(option, imagecut, path, number, c_word)  # 裁剪图
-    #     copyRenameJpgToBackdrop(option, path, number, c_word)
-    #     PrintFiles(option, path, c_word, json_data['naming_rule'], part, cn_sub, json_data, filepath, config.failed_folder, tag)  # 打印文件 .nfo
-    #     pasteFileToFolder(filepath, path, number, c_word)  # 移动文件
-    # # =======================================================================整理模式
-    # elif config.program_mode == '2':
-    #     pasteFileToFolder_mode2(filepath, path, multi_part, number, part, c_word)  # 移动文件
-    # CEF(config.success_folder)
-    # CEF(config.failed_folder)
-
-    print("[+]All finished!!!")
-    input("[+][+]Press enter key exit, you can check the error message before you exit.")
+import re
+import sys
+from ADC_function import *
+import json
+import shutil
+from configparser import ConfigParser
+
+os.chdir(os.getcwd())
+
+# ============global var===========
+version='1.3'
+
+config = ConfigParser()
+config.read(config_file, encoding='UTF-8')
+
+Platform = sys.platform
+
+# ==========global var end=========
+
+def UpdateCheck():
+    if UpdateCheckSwitch() == '1':
+        html2 = get_html('https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/update_check.json')
+        html = json.loads(str(html2))
+
+        if not version == html['version']:
+            print('[*] * New update ' + html['version'] + ' *')
+            print('[*] * Download *')
+            print('[*] ' + html['download'])
+            print('[*]=====================================')
+    else:
+        print('[+]Update Check disabled!')
+
+def movie_lists():
+    global exclude_directory_1
+    global exclude_directory_2
+    directory = config['directory_capture']['directory']
+    total=[]
+    file_type = ['mp4','avi','rmvb','wmv','mov','mkv','flv','ts']
+    exclude_directory_1 = config['common']['failed_output_folder']
+    exclude_directory_2 = config['common']['success_output_folder']
+    if directory=='*':
+        remove_total = []
+        for o in file_type:
+            remove_total += glob.glob(r"./" + exclude_directory_1 + "/*." + o)
+            remove_total += glob.glob(r"./" + exclude_directory_2 + "/*." + o)
+        for i in os.listdir(os.getcwd()):
+            for a in file_type:
+                total += glob.glob(r"./" + i + "/*." + a)
+        for b in remove_total:
+            total.remove(b)
+        return total
+    for a in file_type:
+        total += glob.glob(r"./" + directory + "/*." + a)
+    return total
+
+def CreatFailedFolder():
+    if not os.path.exists('failed/'):  # 新建failed文件夹
+        try:
+            os.makedirs('failed/')
+        except:
+            print("[-]failed!can not be make folder 'failed'\n[-](Please run as Administrator)")
+            os._exit(0)
+
+def lists_from_test(custom_nuber):  # 电影列表
+    a=[]
+    a.append(custom_nuber)
+    return a
+
+def CEF(path):
+    try:
+        files = os.listdir(path)  # 获取路径下的子文件(夹)列表
+        for file in files:
+            os.removedirs(path + '/' + file)  # 删除这个空文件夹
+            print('[+]Deleting empty folder', path + '/' + file)
+    except:
+        a=''
+
+def rreplace(self, old, new, *max):
+    # 从右开始替换文件名中内容,源字符串,将被替换的子字符串, 新字符串用于替换old子字符串可选字符串, 替换不超过 max 次
+    count = len(self)
+    if max and str(max[0]).isdigit():
+        count = max[0]
+    return new.join(self.rsplit(old, count))
+
+def getNumber(filepath):
+    filepath = filepath.replace('.\\','')
+    try:  # 普通提取番号 主要处理包含减号-的番号
+        filepath = filepath.replace("_", "-")
+        filepath.strip('22-sht.me').strip('-HD').strip('-hd')
+        filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath))  # 去除文件名中时间
+        try:
+            file_number = re.search('\w+-\d+', filename).group()
+        except:  # 提取类似mkbd-s120番号
+            file_number = re.search('\w+-\w+\d+', filename).group()
+        return file_number
+    except:  # 提取不含减号-的番号
+        try:
+            filename = str(re.sub("ts6\d", "", filepath)).strip('Tokyo-hot').strip('tokyo-hot')
+            filename = str(re.sub(".*?\.com-\d+", "", filename)).replace('_', '')
+            file_number = str(re.search('\w+\d{4}', filename).group(0))
+            return file_number
+        except:  # 提取无减号番号
+            filename = str(re.sub("ts6\d", "", filepath))  # 去除ts64/265
+            filename = str(re.sub(".*?\.com-\d+", "", filename))
+            file_number = str(re.match('\w+', filename).group())
+            file_number = str(file_number.replace(re.match("^[A-Za-z]+", file_number).group(),re.match("^[A-Za-z]+", file_number).group() + '-'))
+            return file_number
+
+def RunCore():
+    if Platform == 'win32':
+        if os.path.exists('core.py'):
+            os.system('python core.py' + ' "' + i + '" --number "' + getNumber(i) + '"')  # 从py文件启动用于源码py
+        elif os.path.exists('core.exe'):
+            os.system('core.exe' + ' "' + i + '" --number "' + getNumber(i) + '"')  # 从exe启动用于EXE版程序
+        elif os.path.exists('core.py') and os.path.exists('core.exe'):
+            os.system('python core.py' + ' "' + i + '" --number "' + getNumber(i) + '"')  # 从py文件启动用于源码py
+    else:
+        if os.path.exists('core.py'):
+            os.system('python3 core.py' + ' "' + i + '" --number "' + getNumber(i) + '"')  # 从py文件启动用于源码py
+        elif os.path.exists('core.exe'):
+            os.system('core.exe' + ' "' + i + '" --number "' + getNumber(i) + '"')  # 从exe启动用于EXE版程序
+        elif os.path.exists('core.py') and os.path.exists('core.exe'):
+            os.system('python3 core.py' + ' "' + i + '" --number "' + getNumber(i) + '"')  # 从py文件启动用于源码py
+
+if __name__ =='__main__':
+    print('[*]===========AV Data Capture===========')
+    print('[*] Version '+version)
+    print('[*]=====================================')
+
+    CreatFailedFolder()
+    UpdateCheck()
+    os.chdir(os.getcwd())
+
+    count = 0
+    count_all = str(len(movie_lists()))
+    print('[+]Find',str(len(movie_lists())),'movies')
+    for i in movie_lists():  # 遍历电影列表 交给core处理
+        count = count + 1
+        percentage = str(count/int(count_all)*100)[:4]+'%'
+        print('[!] - '+percentage+' ['+str(count)+'/'+count_all+'] -')
+        try:
+            print("[!]Making Data for [" + i + "], the number is [" + getNumber(i) + "]")
+            RunCore()
+            print("[*]=====================================")
+        except:  # 番号提取异常
+            print('[-]' + i + ' Cannot catch the number :')
+            print('[-]Move ' + i + ' to failed folder')
+            shutil.move(i, str(os.getcwd()) + '/' + 'failed/')
+            continue
+    CEF(exclude_directory_1)
+    CEF(exclude_directory_2)
+    print("[+]All finished!!!")
+    input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")

ConfigApp.py

@@ -1,28 +0,0 @@
from configparser import ConfigParser
from MediaServer import MediaServer


class ConfigApp:
    def __init__(self):
        config_file = 'config.ini'
        config = ConfigParser()
        config.read(config_file, encoding='UTF-8')

        self.success_folder = config['common']['success_output_folder']
        self.failed_folder = config['common']['failed_output_folder']  # 失败输出目录
        self.escape_folder = config['escape']['folders']  # 多级目录刮削需要排除的目录
        self.search_folder = config['common']['search_folder']  # 搜索路径
        self.temp_folder = config['common']['temp_folder']  # 临时资源路径
        self.soft_link = (config['common']['soft_link'] == 1)
        # self.escape_literals = (config['escape']['literals'] == 1)
        self.naming_rule = config['Name_Rule']['naming_rule']
        self.location_rule = config['Name_Rule']['location_rule']
        self.proxy = config['proxy']['proxy']
        self.timeout = float(config['proxy']['timeout'])
        self.retry = int(config['proxy']['retry'])
        self.media_server = MediaServer[config['media']['media_warehouse']]
        self.update_check = config['update']['update_check']
        self.debug_mode = config['debug_mode']['switch']
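
Editorial note: two details in this deleted helper are worth recording. `config['common']['soft_link'] == 1` compares a string against an int and is therefore always False (configparser returns strings), and `MediaServer[...]` looks an Enum member up by name, which is case-sensitive, so a lower-case config value like `emby` would raise `KeyError`. A small sketch of the safer standard-library pattern (`fallback` is a real configparser keyword):

```python
from configparser import ConfigParser

config = ConfigParser()
config.read('config.ini', encoding='UTF-8')

# getboolean understands '1'/'0', 'yes'/'no', etc.; fallback guards missing keys.
soft_link = config.getboolean('common', 'soft_link', fallback=False)
timeout = config.getfloat('proxy', 'timeout', fallback=10.0)
media = config.get('media', 'media_warehouse', fallback='emby').upper()  # 'EMBY' matches the Enum name
```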

LICENSE Executable file → Normal file (0 lines changed)

@@ -1,19 +0,0 @@
import pandas as pd
import numpy as np

df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                         'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three',
                         'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
print(df)

groupedA = df.groupby('A').describe()
groupedAB = df.groupby(['A', 'B'])['C']
print('---'*18)
for a, b in groupedAB:
    print('--'*18)
    print(a)
    print('-' * 18)
    print(b)

@@ -1,38 +0,0 @@
import pandas as pd
import numpy as np

'''
python数据处理三剑客之一pandas
https://pandas.pydata.org/pandas-docs/stable/user_guide
https://www.pypandas.cn/docs/getting_started/10min.html
'''
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(dates)
print(df)

df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
print(df2)
print(df2.dtypes)

print(df.head())
print(df.tail(5))
print(df.index)
print(df.columns)

df.describe()  # 统计数据摘要
df.T  # index columns互转
df.sort_index(axis=1, ascending=False)  # 排序axis=1 是columnsaxis=0 是index
df.sort_values(by='B')  # 按值排序 按B列中的值排序

# 切列
df.A
df['A']
# 切行
df['20130102':'20130104']
df[0:3]

MediaServer.py

@@ -1,28 +0,0 @@
from enum import Enum, auto


class MediaServer(Enum):
    EMBY = auto()
    PLEX = auto()
    KODI = auto()

    # media = EMBY
    #
    # def __init__(self, arg):
    #     self = [e for e in MediaServer if arg.upper() == self.name]

    def poster_name(self, name):
        if self == MediaServer.EMBY:  # 保存[name].png
            return name + '.png'
        elif self == MediaServer.KODI:  # 保存[name]-poster.jpg
            return name + '-poster.jpg'
        elif self == MediaServer.PLEX:  # 保存 poster.jpg
            return 'poster.jpg'

    def image_name(self, name):
        if self == MediaServer.EMBY:  # name.jpg
            return name + '.jpg'
        elif self == MediaServer.KODI:  # [name]-fanart.jpg
            return name + '-fanart.jpg'
        elif self == MediaServer.PLEX:  # fanart.jpg
            return 'fanart.jpg'
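
Editorial note: a quick usage sketch of this Enum. Name lookup via `MediaServer[...]` is case-sensitive, so a config value like `emby` needs upper-casing first:

```python
from MediaServer import MediaServer

server = MediaServer['emby'.upper()]            # -> MediaServer.EMBY
print(server.poster_name('ABC-123'))            # ABC-123.png
print(server.image_name('ABC-123'))             # ABC-123.jpg
print(MediaServer.PLEX.poster_name('ABC-123'))  # poster.jpg, Plex ignores the name
```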

Metadata.py

@@ -1,3 +0,0 @@
from addict import Dict

# class Metadata:
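
Editorial note: this stub only imports `addict.Dict`, presumably as the basis of the planned `Metadata` class. `addict` provides dict subclasses with attribute-style access and auto-created nesting, for example:

```python
from addict import Dict

meta = Dict()
meta.number = 'ABC-123'
meta.cover.small = 'https://example.com/thumb.jpg'  # nested keys spring into existence
print(meta['cover']['small'])  # plain dict access still works
```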

PathNameProcessor.py

@@ -1,115 +0,0 @@
import re
import fuckit


class PathNameProcessor:
    # 类变量
    pattern_of_file_name_suffixes = r'.(mov|mp4|avi|rmvb|wmv|mov|mkv|flv|ts|m2ts)$'

    # def __init__(self):

    @staticmethod
    def remove_distractions(origin_name):
        """移除干扰项"""
        # 移除文件类型后缀
        origin_name = re.sub(PathNameProcessor.pattern_of_file_name_suffixes, '', origin_name, 0, re.IGNORECASE)
        # 处理包含减号-和_的番号'/-070409_621'
        origin_name = re.sub(r'[-_~*# ]', "-", origin_name, 0)
        origin_name = re.sub(r'(Carib)(bean)?', '-', origin_name, 0, re.IGNORECASE)
        origin_name = re.sub(r'(1pondo)', '-', origin_name, 0, re.IGNORECASE)
        origin_name = re.sub(r'(tokyo)[-. ]?(hot)', '-', origin_name, 0, re.IGNORECASE)
        origin_name = re.sub(r'Uncensored', '-', origin_name, 0, re.IGNORECASE)
        origin_name = re.sub(r'JAV', '-', origin_name, 0, re.IGNORECASE)
        # 移除干扰字段
        origin_name = origin_name.replace('22-sht.me', '-')
        # 去除文件名中时间 1970-2099年 月 日
        pattern_of_date = r'(?:-)(19[789]\d|20\d{2})(-?(0\d|1[012])-?(0[1-9]|[12]\d|3[01])?)?[-.]'
        # 移除字母开头 清晰度相关度 字符
        pattern_of_resolution_alphas = r'(?<![a-zA-Z])(SD|((F|U)|(Full|Ultra)[-_*. ~]?)?HD|BD|(blu[-_*. ~]?ray)|[hx]264|[hx]265|HEVC)'
        # 数字开头的 清晰度相关度 字符
        pattern_of_resolution_numbers = r'(?<!\d)(4K|(1080[ip])|(720p)|(480p))'
        origin_name = re.sub(pattern_of_resolution_alphas, "-", origin_name, 0, re.IGNORECASE)
        origin_name = re.sub(pattern_of_resolution_numbers, "-", origin_name, 0, re.IGNORECASE)
        origin_name = re.sub(pattern_of_date, "-", origin_name)
        if 'FC2' or 'fc2' in origin_name:
            origin_name = origin_name.replace('-PPV', '').replace('PPV-', '').replace('FC2PPV-', 'FC2-').replace(
                'FC2PPV_', 'FC2-')
        # 移除连续重复无意义符号-
        origin_name = re.sub(r"([-.])(\1+)", r"\1", origin_name)
        # 移除尾部无意义符号 方便识别剧集数
        origin_name = re.sub(r'[-.]+$', "", origin_name)
        return origin_name

    @staticmethod
    def extract_suffix_episode(origin_name):
        """ 提取尾部集数号 123ABC(只识别一位) part1 ipz.A CD1 NOP019B.HD.wmv"""
        episode = None
        with fuckit:
            # 零宽断言获取尾部数字 剧集数 123
            pattern_episodes_number = r'(?<!\d)\d$'
            episode = re.findall(pattern_episodes_number, origin_name)[-1]
            origin_name = re.sub(pattern_episodes_number, "", origin_name)
        with fuckit:
            # 零宽断言获取尾部字幕 剧集数 abc
            pattern_episodes_alpha = r'(?<![a-zA-Z])[a-zA-Z]$'
            episode = re.findall(pattern_episodes_alpha, origin_name)[-1]
            origin_name = re.sub(pattern_episodes_alpha, "", origin_name)
        return episode, origin_name

    @staticmethod
    def extract_code(origin_name):
        """
        提取集数和 规范过的番号
        """
        name = None
        episode = None
        with fuckit:
            # 找到含- 或不含-的 番号1. 数字+数字 2. 字母+数字
            name = re.findall(r'(?:\d{2,}-\d{2,})|(?:[A-Z]+-?[A-Z]*\d{2,})', origin_name)[-1]
            episode = PathNameProcessor.extract_episode_behind_code(origin_name, name)
        # 将未-的名字处理加上 -
        if not ('-' in name):
            # 无减号-的番号,尝试分段加上-
            # 非贪婪匹配非特殊字符零宽断言后数字至少2位连续,ipz221.part2 mide072hhb ,n1180
            with fuckit:
                name = re.findall(r'[a-zA-Z]+\d{2,}', name)[-1]
                # 比如MCDV-47 mcdv-047 是2个不一样的片子但是 SIVR-00008 和 SIVR-008是同同一部,但是heyzo除外,heyzo 是四位数
                if "heyzo" not in name.lower():
                    name = re.sub(r'([a-zA-Z]{2,})(?:0*?)(\d{2,})', r'\1-\2', name)
        # 正则取含-的番号 【字母-[字母]数字】,数字必定大于2位 番号的数组的最后的一个元素
        with fuckit:
            # MKBD_S03-MaRieS
            name = re.findall(r'[a-zA-Z|\d]+-[a-zA-Z|\d]*\d{2,}', name)[-1]
            # 107NTTR-037 -> NTTR-037 , SIVR-00008 -> SIVR-008 但是heyzo除外
            if "heyzo" not in name.lower():
                searched = re.search(r'([a-zA-Z]{2,})-(?:0*)(\d{3,})', name)
                if searched:
                    name = '-'.join(searched.groups())
        return episode, name

    @staticmethod
    def extract_episode_behind_code(origin_name, code):
        episode = None
        with fuckit:
            # 零宽断言获取尾部字幕 剧集数 abc123
            result_dict = re.search(rf'(?<={code})-?((?P<alpha>([A-Z](?![A-Z])))|(?P<num>\d(?!\d)))', origin_name,
                                    re.I).groupdict()
            episode = result_dict['alpha'] or result_dict['num']
        return episode


def safe_list_get(list_in, idx, default):
    try:
        return list_in[idx]
    except IndexError:
        return default
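
Editorial note: taken together, the three static methods form a pipeline: strip noise, peel a trailing episode marker, then isolate the code. A hedged walk-through on one sample name, with expected values derived from the regexes above (note, in passing, that `if 'FC2' or 'fc2' in origin_name:` is always true, a pre-existing quirk kept verbatim):

```python
from PathNameProcessor import PathNameProcessor

name = 'ABP-123-CD2 1080p.mp4'.upper()
name = PathNameProcessor.remove_distractions(name)              # drops '.MP4' and '1080P', collapses '-'
episode, name = PathNameProcessor.extract_suffix_episode(name)  # trailing digit '2' becomes the episode
_, code = PathNameProcessor.extract_code(name)                  # isolates the hyphenated code
print(code, episode)  # expected: ABP-123 2
```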

Pipfile (19 lines changed)

@@ -1,19 +0,0 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]

[packages]
bs4 = "*"
tenacity = "*"
fuckit = "*"
requests = "*"
image = "*"
lazyxml = {editable = true,git = "https://github.com/waynedyck/lazyxml.git",ref = "python-3-conversion_wd1"}
lxml = "*"
pyquery = "*"

[requires]
python_version = "3.8"

Pipfile.lock generated (246 lines changed)

@@ -1,246 +0,0 @@
{
"_meta": {
"hash": {
"sha256": "15bf3c6af3ec315358a0217481a13285f95fc742bb5db8a1f934e0d1c3d7d5e2"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.8"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"asgiref": {
"hashes": [
"sha256:5ee950735509d04eb673bd7f7120f8fa1c9e2df495394992c73234d526907e17",
"sha256:7162a3cb30ab0609f1a4c95938fd73e8604f63bdba516a7f7d64b83ff09478f0"
],
"markers": "python_version >= '3.5'",
"version": "==3.3.1"
},
"beautifulsoup4": {
"hashes": [
"sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
"sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25",
"sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"
],
"version": "==4.9.3"
},
"bs4": {
"hashes": [
"sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
],
"index": "pypi",
"version": "==0.0.1"
},
"certifi": {
"hashes": [
"sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c",
"sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"
],
"version": "==2020.12.5"
},
"chardet": {
"hashes": [
"sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa",
"sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.0.0"
},
"cssselect": {
"hashes": [
"sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf",
"sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.1.0"
},
"django": {
"hashes": [
"sha256:2d78425ba74c7a1a74b196058b261b9733a8570782f4e2828974777ccca7edf7",
"sha256:efa2ab96b33b20c2182db93147a0c3cd7769d418926f9e9f140a60dca7c64ca9"
],
"markers": "python_version >= '3.6'",
"version": "==3.1.5"
},
"fuckit": {
"hashes": [
"sha256:059488e6aa2053da9db5eb5101e2498f608314da5118bf2385acb864568ccc25"
],
"index": "pypi",
"version": "==4.8.1"
},
"idna": {
"hashes": [
"sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6",
"sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.10"
},
"image": {
"hashes": [
"sha256:baa2e09178277daa50f22fd6d1d51ec78f19c12688921cb9ab5808743f097126"
],
"index": "pypi",
"version": "==1.5.33"
},
"lazyxml": {
"editable": true,
"git": "https://github.com/waynedyck/lazyxml.git",
"ref": "f42ea4a4febf4c1e120b05d6ca9cef42556a75d5"
},
"lxml": {
"hashes": [
"sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d",
"sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37",
"sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01",
"sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2",
"sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644",
"sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75",
"sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80",
"sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2",
"sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780",
"sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98",
"sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308",
"sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf",
"sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388",
"sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d",
"sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3",
"sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8",
"sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af",
"sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2",
"sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e",
"sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939",
"sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03",
"sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d",
"sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a",
"sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5",
"sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a",
"sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711",
"sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf",
"sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089",
"sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505",
"sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b",
"sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f",
"sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc",
"sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e",
"sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931",
"sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc",
"sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe",
"sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e"
],
"index": "pypi",
"version": "==4.6.2"
},
"pillow": {
"hashes": [
"sha256:165c88bc9d8dba670110c689e3cc5c71dbe4bfb984ffa7cbebf1fac9554071d6",
"sha256:1d208e670abfeb41b6143537a681299ef86e92d2a3dac299d3cd6830d5c7bded",
"sha256:22d070ca2e60c99929ef274cfced04294d2368193e935c5d6febfd8b601bf865",
"sha256:2353834b2c49b95e1313fb34edf18fca4d57446675d05298bb694bca4b194174",
"sha256:39725acf2d2e9c17356e6835dccebe7a697db55f25a09207e38b835d5e1bc032",
"sha256:3de6b2ee4f78c6b3d89d184ade5d8fa68af0848f9b6b6da2b9ab7943ec46971a",
"sha256:47c0d93ee9c8b181f353dbead6530b26980fe4f5485aa18be8f1fd3c3cbc685e",
"sha256:5e2fe3bb2363b862671eba632537cd3a823847db4d98be95690b7e382f3d6378",
"sha256:604815c55fd92e735f9738f65dabf4edc3e79f88541c221d292faec1904a4b17",
"sha256:6c5275bd82711cd3dcd0af8ce0bb99113ae8911fc2952805f1d012de7d600a4c",
"sha256:731ca5aabe9085160cf68b2dbef95fc1991015bc0a3a6ea46a371ab88f3d0913",
"sha256:7612520e5e1a371d77e1d1ca3a3ee6227eef00d0a9cddb4ef7ecb0b7396eddf7",
"sha256:7916cbc94f1c6b1301ac04510d0881b9e9feb20ae34094d3615a8a7c3db0dcc0",
"sha256:81c3fa9a75d9f1afafdb916d5995633f319db09bd773cb56b8e39f1e98d90820",
"sha256:887668e792b7edbfb1d3c9d8b5d8c859269a0f0eba4dda562adb95500f60dbba",
"sha256:93a473b53cc6e0b3ce6bf51b1b95b7b1e7e6084be3a07e40f79b42e83503fbf2",
"sha256:96d4dc103d1a0fa6d47c6c55a47de5f5dafd5ef0114fa10c85a1fd8e0216284b",
"sha256:a3d3e086474ef12ef13d42e5f9b7bbf09d39cf6bd4940f982263d6954b13f6a9",
"sha256:b02a0b9f332086657852b1f7cb380f6a42403a6d9c42a4c34a561aa4530d5234",
"sha256:b09e10ec453de97f9a23a5aa5e30b334195e8d2ddd1ce76cc32e52ba63c8b31d",
"sha256:b6f00ad5ebe846cc91763b1d0c6d30a8042e02b2316e27b05de04fa6ec831ec5",
"sha256:bba80df38cfc17f490ec651c73bb37cd896bc2400cfba27d078c2135223c1206",
"sha256:c3d911614b008e8a576b8e5303e3db29224b455d3d66d1b2848ba6ca83f9ece9",
"sha256:ca20739e303254287138234485579b28cb0d524401f83d5129b5ff9d606cb0a8",
"sha256:cb192176b477d49b0a327b2a5a4979552b7a58cd42037034316b8018ac3ebb59",
"sha256:cdbbe7dff4a677fb555a54f9bc0450f2a21a93c5ba2b44e09e54fcb72d2bd13d",
"sha256:cf6e33d92b1526190a1de904df21663c46a456758c0424e4f947ae9aa6088bf7",
"sha256:d355502dce85ade85a2511b40b4c61a128902f246504f7de29bbeec1ae27933a",
"sha256:d673c4990acd016229a5c1c4ee8a9e6d8f481b27ade5fc3d95938697fa443ce0",
"sha256:dc577f4cfdda354db3ae37a572428a90ffdbe4e51eda7849bf442fb803f09c9b",
"sha256:dd9eef866c70d2cbbea1ae58134eaffda0d4bfea403025f4db6859724b18ab3d",
"sha256:f50e7a98b0453f39000619d845be8b06e611e56ee6e8186f7f60c3b1e2f0feae"
],
"markers": "python_version >= '3.6'",
"version": "==8.1.0"
},
"pyquery": {
"hashes": [
"sha256:1fc33b7699455ed25c75282bc8f80ace1ac078b0dda5a933dacbd8b1c1f83963",
"sha256:a388eefb6bc4a55350de0316fbd97cda999ae669b6743ae5b99102ba54f5aa72"
],
"index": "pypi",
"version": "==1.4.3"
},
"pytz": {
"hashes": [
"sha256:16962c5fb8db4a8f63a26646d8886e9d769b6c511543557bc84e9569fb9a9cb4",
"sha256:180befebb1927b16f6b57101720075a984c019ac16b1b7575673bea42c6c3da5"
],
"version": "==2020.5"
},
"requests": {
"hashes": [
"sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
"sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"
],
"index": "pypi",
"version": "==2.25.1"
},
"six": {
"hashes": [
"sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
"sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.15.0"
},
"soupsieve": {
"hashes": [
"sha256:4bb21a6ee4707bf43b61230e80740e71bfe56e55d1f1f50924b087bb2975c851",
"sha256:6dc52924dc0bc710a5d16794e6b3480b2c7c08b07729505feab2b2c16661ff6e"
],
"markers": "python_version >= '3.0'",
"version": "==2.1"
},
"sqlparse": {
"hashes": [
"sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0",
"sha256:0f91fd2e829c44362cbcfab3e9ae12e22badaa8a29ad5ff599f9ec109f0454e8"
],
"markers": "python_version >= '3.5'",
"version": "==0.4.1"
},
"tenacity": {
"hashes": [
"sha256:baed357d9f35ec64264d8a4bbf004c35058fad8795c5b0d8a7dc77ecdcbb8f39",
"sha256:e14d191fb0a309b563904bbc336582efe2037de437e543b38da749769b544d7f"
],
"index": "pypi",
"version": "==6.3.1"
},
"urllib3": {
"hashes": [
"sha256:19188f96923873c92ccb987120ec4acaa12f0461fa9ce5d3d0772bc965a39e08",
"sha256:d8ff90d979214d7b4f8ce956e80f4028fc6860e4431f731ea4a8c08f23f99473"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.2"
}
},
"develop": {}
}

README.md Executable file → Normal file (389 lines changed)

@@ -1,263 +1,223 @@
-# AV Data Capture (CLI)
-CLI 版本
+# AV Data Capture
 <a title="Hits" target="_blank" href="https://github.com/yoshiko2/AV_Data_Capture"><img src="https://hits.b3log.org/yoshiko2/AV_Data_Capture.svg"></a>
 ![](https://img.shields.io/badge/build-passing-brightgreen.svg?style=flat-square)
-![](https://img.shields.io/github/downloads/yoshiko2/av_data_capture/total.svg?style=flat-square)
+![](https://img.shields.io/github/downloads/yoshiko2/av_data_capture/total.svg?style=flat-square)<br>
 ![](https://img.shields.io/github/license/yoshiko2/av_data_capture.svg?style=flat-square)
 ![](https://img.shields.io/github/release/yoshiko2/av_data_capture.svg?style=flat-square)
 ![](https://img.shields.io/badge/Python-3.7-yellow.svg?style=flat-square&logo=python)<br>
-[GUI 版本](https://github.com/moyy996/AVDC)
-<a title="Hits" target="_blank" href="https://github.com/moyy996/avdc"><img src="https://hits.b3log.org/moyy996/AVDC.svg"></a>
-![](https://img.shields.io/badge/build-passing-brightgreen.svg?style=flat-square)
-![](https://img.shields.io/github/downloads/moyy996/avdc/total.svg?style=flat-square)
-![](https://img.shields.io/github/license/moyy996/avdc.svg?style=flat-square)
-![](https://img.shields.io/github/release/moyy996/avdc.svg?style=flat-square)
-![](https://img.shields.io/badge/Python-3.6-yellow.svg?style=flat-square&logo=python)
-![](https://img.shields.io/badge/Pyqt-5-blue.svg?style=flat-square)<br>
-**日本电影元数据 抓取工具 | 刮削器**,配合本地影片管理软件 Emby, Jellyfin, Kodi 等管理本地影片;该软件起到分类与元数据metadata抓取作用利用元数据信息来分类供本地影片分类整理使用。
-##### 本地电影刮削与整理一体化解决方案
+**日本电影元数据 抓取工具 | 刮削器**配合本地影片管理软件EMBY,KODI等管理本地影片该软件起到分类与元数据抓取作用利用元数据信息来分类供本地影片分类整理使用。
 # 目录
 * [声明](#声明)
-* [FAQ](#FAQ)
+* [你问我答 FAQ](#你问我答-faq)
 * [故事](#故事)
 * [效果图](#效果图)
 * [如何使用](#如何使用)
 * [下载](#下载)
-* [简要教程](#简要教程)
-* [完整文档](#完整文档)
-* [模块安装](#模块安装)
-* [配置](#配置configini)
-* [多目录影片处理](#多目录影片处理)
-* [多集影片处理](#多集影片处理)
-* [中文字幕处理](#中文字幕处理)
-* [异常处理(重要)](#异常处理重要)
-* [写在后面](#写在后面)
+* [简明教程](#简要教程)
+* [模块安装](#1请安装模块在cmd终端逐条输入以下命令安装)
+* [配置](#2配置configini)
+* [(可选)设置自定义目录和影片重命名规则](#3可选设置自定义目录和影片重命名规则)
+* [运行软件](#5运行-av_data_capturepyexe)
+* [影片原路径处理](#4建议把软件拷贝和电影的统一目录下)
+* [异常处理(重要)](#51异常处理重要)
+* [导入至媒体库](#7把jav_output文件夹导入到embykodi中等待元数据刷新完成)
+* [关于群晖NAS](#8关于群晖NAS)
+* [写在后面](#9写在后面)
 # 声明
-* 本软件仅供**技术交流,学术交流**使用
-* 本软件作者编写出该软件旨在学习 Python ,提高编程水平
-* 用户在使用本软件前,请用户自觉遵守当地法律法规,如果本软件使用过程中存在违反当地法律法规的行为,请勿使用该软件
-* 用户在使用本软件时,若产生一切违法行为由用户承担
-* 严禁用户将本软件使用于商业和个人其他意图
-* 本软件作者保留最终决定权和最终解释权
-**若用户不同意上述条款任意一条,请勿使用本软件**
+* 本软件仅供**技术交流,学术交流**使用<br>
+* 本软件作者编写出该软件旨在学习Python3提高编程水平<br>
+* 用户在使用该软件前,请用户自觉遵守当地法律法规,如果该软件使用过程中存在违反当地法律法规的行为,请勿使用该软件<br>
+* 用户使用该软件时,若产生一切违法行为由用户承担<br>
+* 严禁用户使用于商业和个人其他意图<br>
+* 本软件作者保留最终决定权和最终解释权<br>
+**若用户不同意上述条款任意一条,请勿使用该软件**<br>
-# FAQ
-### 软件能下片吗?
-* 本软件不提供任何影片下载地址,仅供本地影片分类整理使用
-### 什么是元数据metadata
-* 元数据包括了影片的封面,导演,演员,简介,类型......
+# 你问我答 FAQ
+### 这软件能下片吗?
+* 该软件不提供任何影片下载地址,仅供本地影片分类整理使用。
+### 什么是元数据?
+* 元数据包括了影片的:封面,导演,演员,简介,类型......
 ### 软件收费吗?
-* 本软件永久免费,**除了作者<ruby><rt>yìng</rt></ruby>点以外**
+* 软件永久免费。**除了作者钦点以外**
 ### 软件运行异常怎么办?
-* 认真看 [异常处理(重要)](#异常处理重要)
-### 为什么软件要单线程运行?
-* 多线程爬取可能会触发网站反爬机制,同时也违背了些道德,故单线程运行
+* 认真看 [异常处理(重要)](#5异常处理重要)
+# 故事
+[点击跳转至作者博客文章](https://yoshiko2.github.io/2019/10/18/AVDC/)
 # 效果图
-**图片来自网络**图片仅供参考,具体效果请自行联想
-![preview_picture_1](https://i.loli.net/2019/07/04/5d1cf9bb1b08b86592.jpg)
-![preview_picture_2](https://i.loli.net/2019/07/04/5d1cf9bb2696937880.jpg)
+**图片来自网络**由于相关法律法规,具体效果请自行联想
+![](https://i.loli.net/2019/07/04/5d1cf9bb1b08b86592.jpg)
+![](https://i.loli.net/2019/07/04/5d1cf9bb2696937880.jpg)<br>
 # 如何使用
-## 下载
-* release的程序可脱离**python环境**运行,可跳过 [模块安装](#模块安装)
-### Windows
-Release 下载地址(**仅限Windows**):
-[![](https://img.shields.io/badge/%E4%B8%8B%E8%BD%BD-windows-blue.svg?style=for-the-badge&logo=windows)](https://github.com/yoshiko2/AV_Data_Capture/releases)
-* 若 Windows 用户需要运行源代码版本,请安装 Windows Python 环境:[点击前往](https://www.python.org/downloads/windows/) 选中 executable installer 下载
-### MacOS, Linux
-* MacOS, Linux 用户请下载源码包运行
-* MacOS Python环境开箱即用[可选安装最新版本](https://docs.brew.sh/Homebrew-and-Python)
-* Linux Python环境开箱即用可选安装最新版本恕 Linux 版本众多请自行搜索
-## 简要教程:
-1. 把软件拉到和电影的同一目录
-2. 设置 config.ini 文件的代理(路由器拥有自动代理功能的可以把 proxy= 后面内容去掉)
-3. 运行软件等待完成
-4. 把 JAV_output 导入至 Kodi, Emby, Jellyfin 中。
-详细请看以下完整文档
-# 完整文档
-## 模块安装
-如果运行**源码**版,运行前请安装**Python环境**和安装以下**模块**
-在终端 cmd/Powershell/Terminal 中输入以下代码来安装模块
+### 下载
+* release的程序可脱离**python环境**运行,可跳过 [模块安装](#1请安装模块在cmd终端逐条输入以下命令安装)<br>Release 下载地址(**仅限Windows**):<br>[![](https://img.shields.io/badge/%E4%B8%8B%E8%BD%BD-windows-blue.svg?style=for-the-badge&logo=windows)](https://github.com/yoshiko2/AV_Data_Capture/releases)<br>
+* Linux,MacOS请下载源码包运行
+* Windows Python环境:[点击前往](https://www.python.org/downloads/windows/) 选中executable installer下载
+* MacOS Python环境[点击前往](https://www.python.org/downloads/mac-osx/)
+* Linux Python环境Linux用户懂的吧不解释下载地址
+### 简要教程:<br>
+**1.把软件拉到和电影的同一目录<br>2.设置ini文件的代理路由器拥有自动代理功能的可以把proxy=后面内容去掉)<br>3.运行软件等待完成<br>4.把JAV_output导入至KODI,EMBY中。<br>详细请看以下教程**<br>
+## 1.请安装模块,在CMD/终端逐条输入以下命令安装
 ```python
-pip install requests pyquery lxml Beautifulsoup4 pillow
+pip install requests
 ```
+###
+```python
+pip install pyquery
+```
+###
+```python
+pip install lxml
+```
+###
+```python
+pip install Beautifulsoup4
+```
+###
+```python
+pip install pillow
+```
+###
-## 配置config.ini
-### 运行模式
-```
-[common]
-main_mode=1
-```
-1为普通模式
+## 2.配置config.ini
+>[common]<br>
+>main_mode=1<br>
+>failed_output_folder=failed<br>
+>success_output_folder=JAV_output<br>
+>
+>[proxy]<br>
+>proxy=127.0.0.1:1080<br>
+>timeout=10<br>
+>retry=3<br>
+>
+>[Name_Rule]<br>
+>location_rule=actor+'/'+number<br>
+>naming_rule=number+'-'+title<br>
+>
+>[update]<br>
+>update_check=1<br>
+>
+>[media]<br>
+>media_warehouse=emby<br>
+>#emby or plex<br>
+>
+>[directory_capture]<br>
+>directory=<br>
+---
+#### 运行模式
+>[common]<br>
+>main_mode=1<br>
+1为普通模式<br>
 2为整理模式仅根据女优把电影命名为番号并分类到女优名称的文件夹下
-```
-success_output_folder=JAV_outputd
-failed_output_folder=failed
-```
+>failed_output_folder=failed<br>
+>success_output_folder=JAV_outputd<br>
 设置成功输出目录和失败输出目录
+---
-#### 软链接
-方便PT下载完既想刮削又想继续上传的仓鼠党同志
-```
-[common]
-soft_link=0
-```
-1为开启软链接模式
-0为关闭
 ---
 ### 网络设置
-```
-[proxy]
-proxy=127.0.0.1:1081
-timeout=10
-retry=3
-```
-#### 针对某些地区的代理设置
-```
-proxy=127.0.0.1:1081
-```
-打开```config.ini```,在```[proxy]```下的```proxy```行设置本地代理地址和端口支持Shadowxxxx/X,V2XXX本地代理端口
-素人系列抓取建议使用日本代理
-**路由器拥有自动代理功能的可以把proxy=后面内容去掉**
-**本地代理软件开全局模式的用户同上**
-**如果遇到tineout错误可以把文件的proxy=后面的地址和端口删除,并开启代理软件全局模式,或者重启电脑,代理软件,网卡**
----
+#### * 针对“某些地区”的代理设置
+打开```config.ini```,在```[proxy]```下的```proxy```行设置本地代理地址和端口支持Shadowxxxx/X,V2XXX本地代理端口:<br>
+例子:```proxy=127.0.0.1:1080```<br>素人系列抓取建议使用日本代理<br>
+**路由器拥有自动代理功能的可以把proxy=后面内容去掉**<br>
+**本地代理软件开全局模式的同志同上**<br>
+**如果遇到tineout错误可以把文件的proxy=后面的地址和端口删除并开启vpn全局模式或者重启电脑vpn网卡**<br>
 #### 连接超时重试设置
-```
-timeout=10
-```
+>[proxy]<br>
+>timeout=10<br>
 10为超时重试时间 单位:秒
 ---
 #### 连接重试次数设置
-```
-retry=3
-```
+>[proxy]<br>
+>retry=3<br>
 3即为重试次数
 ---
 #### 检查更新开关
-```
-[update]
-update_check=1
-```
+>[update]<br>
+>update_check=1<br>
 0为关闭1为开启不建议关闭
 ---
-### 媒体库选择
-```
-[media]
-media_warehouse=emby
-#emby plex kodi
-```
-可选择emby, plex, kodi
+##### 媒体库选择
+>[media]<br>
+>media_warehouse=emby<br>
+>#emby or plex<br>
+可选择emby, plex<br>
 如果是PLEX请安装插件```XBMCnfoMoviesImporter```
 ---
-### 排除指定字符和目录
-```
-[escape]
-literals=\
-folders=failed,JAV_output
-```
-```literals=``` 标题指定字符删除,例如```iterals=\()```,则删除标题中```\()```字符
-```folders=``` 指定目录,例如```folders=failed,JAV_output```多目录刮削时跳过failed,JAV_output
+#### 抓取目录选择
+>[directory_capture]<br>
+>directory=<br>
+如果directory后面为空则抓取和程序同一目录下的影片设置为``` * ```可抓取软件所在目录下的所有子目录中的影片<br>如果出错请不要加*
 ---
-### 调试模式
-```
-[debug_mode]
-switch=1
-```
+#### 调试模式
+>[debug_mode]<br>switch=1<br>
 如要开启调试模式,请手动输入以上代码到```config.ini```中,开启后可在抓取中显示影片元数据
----
-### (可选)设置自定义目录和影片重命名规则
-```
-[Name_Rule]
-location_rule=actor+'/'+number
-naming_rule=number+'-'+title
-```
+### 3.(可选)设置自定义目录和影片重命名规则
+>[Name_Rule]<br>
+>location_rule=actor+'/'+number<br>
+>naming_rule=number+'-'+title<br>
 已有默认配置
 ---
 #### 命名参数
-```
-title = 片名
-actor = 演员
-studio = 公司
-director = 导演
-release = 发售日
-year = 发行年份
-number = 番号
-cover = 封面链接
-tag = 类型
-outline = 简介
-runtime = 时长
-```
+>title = 片名<br>
+>actor = 演员<br>
+>studio = 公司<br>
+>director = 导演<br>
+>release = 发售日<br>
+>year = 发行年份<br>
+>number = 番号<br>
+>cover = 封面链接<br>
+>tag = 类型<br>
+>outline = 简介<br>
+>runtime = 时长<br>
 上面的参数以下都称之为**变量**
 #### 例子:
-自定义规则方法:有两种元素,变量和字符,无论是任何一种元素之间连接必须要用加号 **+** ,比如:```'naming_rule=['+number+']-'+title```,其中冒号 ' ' 内的文字是字符,没有冒号包含的文字是变量,元素之间连接必须要用加号 **+**
-
-目录结构规则:默认 ```location_rule=actor+'/'+number```
-**不推荐修改时在这里添加 title**,有时 title 过长,因为 Windows API 问题,抓取数据时新建文件夹容易出错。
-影片命名规则:默认 ```naming_rule=number+'-'+title```
-**在 Emby, Kodi等本地媒体库显示的标题不影响目录结构下影片文件的命名**,依旧是 番号+后缀。
----
+自定义规则方法:有两种元素,变量和字符,无论是任何一种元素之间连接必须要用加号 **+** ,比如:```'naming_rule=['+number+']-'+title```,其中冒号 ' ' 内的文字是字符,没有冒号包含的文字是变量,元素之间连接必须要用加号 **+** <br>
+目录结构规则:默认 ```location_rule=actor+'/'+number```<br> **不推荐修改时在这里添加title**有时title过长因为Windows API问题抓取数据时新建文件夹容易出错。<br>
+影片命名规则:默认 ```naming_rule=number+'-'+title```<br> **在EMBY,KODI等本地媒体库显示的标题不影响目录结构下影片文件的命名**,依旧是 番号+后缀。
 ### 更新开关
-```
-[update]
-update_check=1
-```
+>[update]<br>update_check=1<br>
 1为开0为关
-## 多目录影片处理
-可以在多个有影片目录的父目录下搜索影片后缀,然后剪切到和程序同一目录下
-## 多集影片处理
-**建议使用视频合并合并为一个视频文件**
-可以把多集电影按照集数后缀命名为类似```ssni-xxx-cd1.mp4m,ssni-xxx-cd2.mp4abp-xxx-CD1.mp4```的规则,只要含有```-CDn./-cdn.```类似命名规则,即可使用分集功能
-## 中文字幕处理
-运行 ```AV_Data_capture.py/.exe```
-当文件名包含:
+## 4.建议把软件拷贝和电影的统一目录下
+如果```config.ini```中```directory=```后面为空的情况下
+## 5.运行 ```AV_Data_capture.py/.exe```
+当文件名包含:<br>
 中文,字幕,-c., -C., 处理元数据时会加上**中文字幕**标签
-## 异常处理(重要)
+## 5.1 异常处理(重要)
 ### 请确保软件是完整地确保ini文件内容是和下载提供ini文件内容的一致的
 ---
 ### 关于软件打开就闪退
@@ -267,60 +227,47 @@ update_check=1
 ### 关于 ```Updata_check``` 和 ```JSON``` 相关的错误
 跳转 [网络设置](#网络设置)
----
-### 关于字幕文件移动功能
-字幕文件前缀必须与影片文件前缀一致,才可以使用该功能
 ---
 ### 关于```FileNotFoundError: [WinError 3] 系统找不到指定的路径。: 'JAV_output''```
 在软件所在文件夹下新建 JAV_output 文件夹,可能是你没有把软件拉到和电影的同一目录
 ---
 ### 关于连接拒绝的错误
-请设置好[代理](#针对某些地区的代理设置)
+请设置好[代理](#针对某些地区的代理设置)<br>
 ---
 ### 关于Nonetype,xpath报错
-同上
+同上<br>
 ---
 ### 关于番号提取失败或者异常
-**目前可以提取元素的影片:JAVBUS上有元数据的电影素人系列:300Maan,259luxu,siro等,FC2系列**
-
->下一张图片来自 Pockies 的 blog 原作者已授权
+**目前可以提取元素的影片:JAVBUS上有元数据的电影素人系列:300Maan,259luxu,siro等,FC2系列**<br>
+>下一张图片来自Pockies的blog 原作者已授权<br>
 ![](https://raw.githubusercontent.com/Pockies/pic/master/741f9461gy1g1cxc31t41j20i804zdgo.jpg)
-目前作者已经完善了番号提取机制,功能较为强大,可提取上述文件名的的番号,如果出现提取失败或者异常的情况,请用以下规则命名
+目前作者已经完善了番号提取机制,功能较为强大,可提取上述文件名的的番号,如果出现提取失败或者异常的情况,请用以下规则命名<br>
+**妈蛋不要喂软件那么多野鸡片子,不让软件好好活了,操**
 ```
 COSQ-004.mp4
 ```
+针对 **野鸡番号** 你需要把文件名命名为与抓取网站提供的番号一致文件拓展名除外然后把文件拖拽至core.exe/.py<br>
+**野鸡番号**:比如 ```XXX-XXX-1```, ```1301XX-MINA_YUKA``` 这种**野鸡**番号在javbus等资料库存在的作品。<br>**重要**:除了 **影片文件名** ```XXXX-XXX-C```,后面这种-C的是指电影有中文字幕<br>
 条件:文件名中间要有下划线或者减号"_","-",没有多余的内容只有番号为最佳,可以让软件更好获取元数据
-对于多影片重命名,可以用 [ReNamer](http://www.den4b.com/products/renamer) 来批量重命名
+对于多影片重命名,可以用[ReNamer](http://www.den4b.com/products/renamer)来批量重命名<br>
 ---
 ### 关于PIL/image.py
 暂时无解可能是网络问题或者pillow模块打包问题你可以用源码运行要安装好第一步的模块
-### 拖动法
-针对格式比较奇葩的番号
-影片放在和程序同一目录下,拖动至```AV_Data_Capture.exe```,即可完成刮削和整理
-### 软件会自动把元数据获取成功的电影移动到 JAV_output 文件夹中根据演员分类失败的电影移动到failed文件夹中。
-
-### 把JAV_output文件夹导入到 Emby, Kodi中等待元数据刷新完成
-
-### 关于群晖NAS
-开启 SMB并在 Windows 上挂载为网络磁盘即可使用本软件,也适用于其他 NAS
-
-## 写在后面
-怎么样,看着自己的日本电影被这样完美地管理,是不是感觉成就感爆棚呢?
-**tg官方电报群:[ 点击进群](https://t.me/joinchat/J54y1g3-a7nxJ_-WS4-KFQ)**
+## 6.软件会自动把元数据获取成功的电影移动到JAV_output文件夹中根据演员分类失败的电影移动到failed文件夹中。
+## 7.把JAV_output文件夹导入到EMBY,KODI中等待元数据刷新完成
+## 8.关于群晖NAS
+开启SMB在Windows上挂载为网络磁盘即可使用本软件也适用于其他NAS
+## 9.写在后面
+怎么样,看着自己的日本电影被这样完美地管理,是不是感觉成就感爆棚呢?<br>
+**tg官方电报群:[ 点击进群](https://t.me/AV_Data_Capture_Official)**<br>
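
Editorial note: both `[Name_Rule]` values are Python-style expressions over the metadata variables, which is why literal text needs quotes and every element is joined with `+`. A hypothetical illustration of how such a rule string can be evaluated (the dict and the `eval` call here are assumptions for demonstration, not necessarily how the project implements it):

```python
metadata = {'actor': 'ActorName', 'number': 'ABC-123', 'title': 'SomeTitle'}
location_rule = "actor+'/'+number"
naming_rule = "number+'-'+title"
print(eval(location_rule, {}, metadata))  # -> ActorName/ABC-123
print(eval(naming_rule, {}, metadata))    # -> ABC-123-SomeTitle
```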

View File

@ -1,229 +0,0 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import json
import re
from lxml import etree
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(text):
html = etree.fromstring(text, etree.HTMLParser())
result = html.xpath('//*[@id="title"]/text()')[0]
return result
def getActor(text):
# //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(text, etree.HTMLParser())
result = (
str(
html.xpath(
"//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
)
)
.strip(" ['']")
.replace("', '", ",")
)
return result
def getStudio(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'メーカー')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'メーカー')]/following-sibling::td/text()"
)[0]
return result
def getRuntime(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
return re.search(r"\d+", str(result)).group()
def getLabel(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
)[0]
return result
def getNum(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'品番:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'品番:')]/following-sibling::td/text()"
)[0]
return result
def getYear(getRelease):
try:
result = str(re.search(r"\d{4}", getRelease).group())
return result
except:
return getRelease
def getRelease(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
)[0].lstrip("\n")
except:
result = html.xpath(
"//td[contains(text(),'発売日:')]/following-sibling::td/text()"
)[0].lstrip("\n")
return result
def getTag(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
)
except:
result = html.xpath(
"//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
)
return result
def getCover(text, number):
html = etree.fromstring(text, etree.HTMLParser())
cover_number = number
try:
result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
except:
# sometimes fanza modifies _ to \u005f in the image id
if "_" in cover_number:
cover_number = cover_number.replace("_", r"\u005f")
try:
result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
except:
# (TODO) handle more edge case
# print(html)
# raise exception here, same behavior as before
# people's major requirement is fetching the picture
raise ValueError("can not find image")
return result
def getDirector(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'監督:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'監督:')]/following-sibling::td/text()"
)[0]
return result
def getOutline(text):
html = etree.fromstring(text, etree.HTMLParser())
try:
result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace(
"\n", ""
)
if result == "":
result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace(
"\n", ""
)
except:
# (TODO) handle more edge case
# print(html)
return ""
return result
def main(number):
# fanza allows letters + digits + underscore; normalize the input here
# @note: the only underscore usage seen so far is h_test123456789
fanza_search_number = number
# AV_Data_Capture.py's getNumber() over-formats the input; restore the h_ prefix here
if fanza_search_number.startswith("h-"):
fanza_search_number = fanza_search_number.replace("h-", "h_")
fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()
fanza_urls = [
"https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
"https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
"https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
]
chosen_url = ""
for url in fanza_urls:
chosen_url = url + fanza_search_number
htmlcode = get_html(chosen_url)
if "404 Not Found" not in htmlcode:
break
if "404 Not Found" in htmlcode:
return json.dumps({"title": "",})
try:
# for some old page, the input number does not match the page
# for example, the url will be cid=test012
# but the hinban on the page is test00012
# so get the hinban first, and then pass it to following functions
fanza_hinban = getNum(htmlcode)
data = {
"title": getTitle(htmlcode).strip(getActor(htmlcode)),
"studio": getStudio(htmlcode),
"outline": getOutline(htmlcode),
"runtime": getRuntime(htmlcode),
"director": getDirector(htmlcode) if "anime" not in chosen_url else "",
"actor": getActor(htmlcode) if "anime" not in chosen_url else "",
"release": getRelease(htmlcode),
"number": fanza_hinban,
"cover": getCover(htmlcode, fanza_hinban),
"imagecut": 1,
"tag": getTag(htmlcode),
"label": getLabel(htmlcode),
"year": getYear(
getRelease(htmlcode)
), # str(re.search('\d{4}',getRelease(a)).group()),
"actor_photo": "",
"website": chosen_url,
"source": "fanza.py",
}
except:
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
) # .encode('UTF-8')
return js
if __name__ == "__main__":
# print(main("DV-1562"))
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
# print(main("ipx292"))
pass

fc2fans_club.py
@ -1,162 +0,0 @@
import re
from lxml import etree#need install
import json
import ADC_function
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(htmlcode): # get the title (the original comment said "studio", which was wrong)
#print(htmlcode)
html = etree.fromstring(htmlcode,etree.HTMLParser())
result = str(html.xpath('/html/body/div[2]/div/div[1]/h3/text()')).strip(" ['']")
result2 = str(re.sub('\D{2}2-\d+','',result)).replace(' ','',1)
#print(result2)
return result2
def getActor(htmlcode):
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[5]/a/text()')).strip(" ['']")
return result
except:
return ''
def getStudio(htmlcode): # get the studio
html = etree.fromstring(htmlcode,etree.HTMLParser())
result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[3]/a[1]/text()')).strip(" ['']")
return result
def getNum(htmlcode): # get the ID number
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
#print(result)
return result
def getRelease(htmlcode2): #
#a=ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id='+str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-")+'&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
html=etree.fromstring(htmlcode2,etree.HTMLParser())
result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
return result
def getCover(htmlcode,number,htmlcode2): # get the cover URL
#a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-") + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
html = etree.fromstring(htmlcode2, etree.HTMLParser())
result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[1]/a/img/@src')).strip(" ['']")
if result == '':
html = etree.fromstring(htmlcode, etree.HTMLParser())
result2 = str(html.xpath('//*[@id="slider"]/ul[1]/li[1]/img/@src')).strip(" ['']")
return 'https://fc2club.com' + result2
return 'http:' + result
def getOutline(htmlcode2): # get the plot outline
html = etree.fromstring(htmlcode2, etree.HTMLParser())
result = str(html.xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div/article/section[4]/p/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',')
return result
def getTag(htmlcode): # get the genre tags
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[4]/a/text()'))
return result.strip(" ['']").replace("'",'').replace(' ','')
def getYear(release):
try:
result = re.search('\d{4}',release).group()
return result
except:
return ''
def getTitle_fc2com(htmlcode): # get the title
html = etree.fromstring(htmlcode,etree.HTMLParser())
result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0]
return result
def getActor_fc2com(htmlcode):
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0]
return result
except:
return ''
def getStudio_fc2com(htmlcode): # get the studio
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']")
return result
except:
return ''
def getNum_fc2com(htmlcode): # get the ID number
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
return result
def getRelease_fc2com(htmlcode2): #
html=etree.fromstring(htmlcode2,etree.HTMLParser())
result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
return result
def getCover_fc2com(htmlcode2): # get the cover URL
html = etree.fromstring(htmlcode2, etree.HTMLParser())
result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']")
return 'http:' + result
def getOutline_fc2com(htmlcode2): # get the plot outline
html = etree.fromstring(htmlcode2, etree.HTMLParser())
result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',')
return result
def getTag_fc2com(number): # get the genre tags
htmlcode = str(bytes(ADC_function.get_html('http://adult.contents.fc2.com/api/v4/article/'+number+'/tag?'),'utf-8').decode('unicode-escape'))
result = re.findall('"tag":"(.*?)"', htmlcode)
return result
def getYear_fc2com(release):
try:
result = re.search('\d{4}',release).group()
return result
except:
return ''
def main(number):
try:
htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/'+number+'/')
htmlcode = ADC_function.get_html('https://fc2club.com//html/FC2-' + number + '.html')
actor = getActor(htmlcode)
if getActor(htmlcode) == '':
actor = 'FC2系列'
dic = {
'title': getTitle(htmlcode),
'studio': getStudio(htmlcode),
'year': '',#str(re.search('\d{4}',getRelease(number)).group()),
'outline': '',#getOutline(htmlcode2),
'runtime': getYear(getRelease(htmlcode)),
'director': getStudio(htmlcode),
'actor': actor,
'release': getRelease(htmlcode2), # fixed: getRelease parses page HTML, not the bare number
'number': 'FC2-'+number,
'label': '',
'cover': getCover(htmlcode,number,htmlcode2),
'imagecut': 0,
'tag': getTag(htmlcode),
'actor_photo':'',
'website': 'https://fc2club.com//html/FC2-' + number + '.html',
'source':'https://fc2club.com//html/FC2-' + number + '.html',
}
if dic['title'] == '':
htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/',cookies={'wei6H':'1'})
actor = getActor(htmlcode)
if getActor(htmlcode) == '':
actor = 'FC2系列'
dic = {
'title': getTitle_fc2com(htmlcode2),
'studio': getStudio_fc2com(htmlcode2),
'year': '', # str(re.search('\d{4}',getRelease(number)).group()),
'outline': getOutline_fc2com(htmlcode2),
'runtime': getYear_fc2com(getRelease(htmlcode2)),
'director': getStudio_fc2com(htmlcode2),
'actor': actor,
'release': getRelease_fc2com(htmlcode2), # fixed: the function parses page HTML, not the bare number
'number': 'FC2-' + number,
'cover': getCover_fc2com(htmlcode2),
'imagecut': 0,
'tag': getTag_fc2com(number),
'label': '',
'actor_photo': '',
'website': 'http://adult.contents.fc2.com/article/' + number + '/',
'source': 'http://adult.contents.fc2.com/article/' + number + '/',
}
except Exception as e:
# (TODO) better handle this
# print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)#.encode('UTF-8')
return js
#print(main('1252953'))

javdb.py
@ -1,123 +0,0 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(a):
html = etree.fromstring(a, etree.HTMLParser())
result = html.xpath("/html/body/section/div/h2/strong/text()")[0]
return result
def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ')
def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
a = actor.split(',')
d={}
for i in a:
p={i:''}
d.update(p)
return d
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getNum(a):
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result2 + result1).strip('+')
def getYear(getRelease):
try:
result = str(re.search('\d{4}', getRelease).group())
return result
except:
return getRelease
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+')
def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',')
def getCover_small(a, index=0):
# same issue as mentioned below:
# javdb sometimes returns multiple results;
# DO NOT just take the first one, get the one with the correct index number
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
if not 'https' in result:
result = 'https:' + result
return result
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath("//div[@class='column column-video-cover']/a/img/@src")).strip(" ['']")
return result
def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']")
return result
def main(number):
try:
number = number.upper()
query_result = get_html('https://javdb.com/search?q=' + number + '&f=all')
html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
# javdb sometimes returns multiple results,
# and the first element may not be the one we are looking for,
# so iterate over all candidates and find the matching one
urls = html.xpath('//*[@id="videos"]/div/div/a/@href')
ids =html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()')
correct_url = urls[ids.index(number)]
detail_page = get_html('https://javdb.com' + correct_url)
dic = {
'actor': getActor(detail_page),
'title': getTitle(detail_page),
'studio': getStudio(detail_page),
'outline': getOutline(detail_page),
'runtime': getRuntime(detail_page),
'director': getDirector(detail_page),
'release': getRelease(detail_page),
'number': getNum(detail_page),
'cover': getCover(detail_page),
'cover_small': getCover_small(query_result, index=ids.index(number)),
'imagecut': 3,
'tag': getTag(detail_page),
'label': getLabel(detail_page),
'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': getActorPhoto(getActor(detail_page)),
'website': 'https://javdb.com' + correct_url,
'source': 'javdb.py',
}
except Exception as e:
# print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
#print(main('ipx-292'))


@ -1,41 +0,0 @@
/Volumes/Adult/Files/ノ瀬アメリ/Tokyo Hot N0646.avi
/Volumes/Adult/Files/ノ瀬アメリ/MKBD_S03-MaRieS.mp4
/Volumes/192.168.2.100/Adult/Files/Aki Sasaki Megapack/HODV-21299.mkv
/Volumes/Adult/Files/[Tokyo-Hot] [n1180] 美人秘書3穴串刺奉仕残業 (中井綾香 Ayaka Nakai)/(Tokyo-Hot)(n1180)美人秘書3穴串刺奉仕残業 中井綾香.mp4
/mcdv47.avi
/mcdv-47.avi
/mcdv-047.mp4
/mcdv047.mp4
/mcdv0047.mp4
/1pondo-070409_621.mp4
/Volumes/Adult/Files/Kirara Asuka (@明日花キララ) FHD Pack Vol#1(181222)@RUNBKK/No-Watermarked/HOBD00015.FHD2.wmv
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 1/720p/RBD-406_1.mp4
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 1/720p/MDYD-664B.mp4
/Volumes/Adult/Files/107NTTR-037A.mp4
/Volumes/Adult/Files/Yua.Mikami-PML/SNIS-986 国民的アイドル アドレナリン大爆発禁欲1ヶ月後の性欲剥き出し焦らされトランスFUCK 三上悠亜【桃花族】.mp4
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 2/FHD/UPSM-109_2.mkv
/Volumes/Adult/Files/Kirara Asuka (@明日花キララ) FHD Pack Vol#2(181231)@RUNBKK/No-Watermarked/PPT003.SD3.wmv
/Volumes/Adult/Files/波多野结衣/THE波多野結衣 ぶっかけ50連発 CD1.wmv
/Volumes/Adult/Files/波多野结衣/欲しがり 後編 波多野結衣.wmv
/Volumes/Adult/Files/波多野结衣/欲しがり 前編 波多野結衣.wmv
/Volumes/Adult/Files/波多野结衣/加勒比 062212-055 夫の目の前で妻が ~元上司に縛られて~波多野結衣~.rmvb
/Volumes/Adult/Files/波多野结衣/022213-271-carib-whole_s.mp4
/Volumes/Adult/Files/SKYHD-001~010/SKYHD-009_H265.mkv
/Volumes/Adult/Files/大桥步兵合集/LAFBD-41.LaForet.Girl.41.angel.and.devil.Miku.Ohashi.2015.Bluray.1080p.x264.ac3-MTeam.mkv
/Volumes/Adult/Files/大桥步兵合集/032015_161-caribpr-high.mp4
/Volumes/Adult/Files/桃谷绘里香(桃谷エリカ) 所有作品集合/118ppt00016hhb2.mkv
/Volumes/Adult/Files/tia/soe935C.HD.wmv
/Volumes/Adult/Files/SKYHD-011~020/SKYHD-020_H265.mkv
/Volumes/Adult/Files/RION(りおん).Utsunomiya.Shion.宇都宮しをん(うつのみやしをん)/VR/sivr00008_E.mp4
/Volumes/Adult/Files/RION(りおん).Utsunomiya.Shion.宇都宮しをん(うつのみやしをん)/DMM.Video/onsd00899hhb3.mp4
/Volumes/Adult/Files/Rating Top 30 JAV pack/SHKD-744 営業課長の湿ったパンスト 里美ゆりあ.mp4
/Volumes/Adult/Files/Rating Top 30 JAV pack/ABP-627 裏・鈴村あいり-鈴村あいりのオトナの激情SEX4本番 鈴村あいり.MP4
/Volumes/Adult/Files/Rating Top 30 JAV pack/20 ABP-408 上原瑞穂/上原瑞穂 ABP-408 无码流出片段/[ThZu.Cc]20150909164411.m2ts
/Volumes/Adult/Files/Caribbean-101717-520-HD/100917-515/100917-515-carib-1080p.mp4
/Volumes/Adult/Files/ノ瀬アメリ/20081105栗栖エリカ - Sky Angel Blue 10 天舞超絕美少女天使降臨(skyhd010)(中文字幕).avi
/Volumes/Adult/Files/ノ瀬アメリ/一ノ瀬アメリ~加勒比 VERY SEXY.wmv
/Volumes/Adult/Files/ノ瀬アメリ/20101202一瀬アメリ - 東京ブルドック05(inu006).avi
/Volumes/Adult/Files/ノ瀬アメリ/Sky Angel Vol 80 - CD2.mp4
/Volumes/Adult/Files/Mika Sumire すみれ美香/Caribbean-091818-755.mp4
/Volumes/Adult/Files/Takizawa Rola/[HD]abp-031C.wmv
/Volumes/Adult/Files/Takizawa Rola/ABP-013HDA.wmv


@ -1,51 +0,0 @@
/Volumes/192.168.2.100/Adult/Files/Aki Sasaki Megapack/HODV-21222.mkv
/Volumes/Adult/Files/ノ瀬アメリ/Tokyo Hot N0646.avi
/Volumes/Adult/Files/ノ瀬アメリ/MKBD_S03-MaRieS.mp4
/Volumes/192.168.2.100/Adult/Files/RION(りおん).Utsunomiya.Shion.宇都宮しをん(うつのみやしをん)/DMM.Video/onsd00899hhb3.mp4
/Volumes/192.168.2.100/Adult/Files/Rating Top 30 JAV pack/IPTD-999-1 彼女の姉貴とイケナイ関係 Rio.wmv
/Volumes/192.168.2.100/Adult/Files/Rating Top 30 JAV pack/IPTD-999A 彼女の姉貴とイケナイ関係 Rio.wmv
/Volumes/192.168.2.100/Adult/Files/Rating Top 30 JAV pack/IPTD-999-A 彼女の姉貴とイケナイ関係 Rio.wmv
/Volumes/192.168.2.100/Adult/Files/Rating Top 30 JAV pack/IPTD-999-C 彼女の姉貴とイケナイ関係 Rio.wmv
/Volumes/192.168.2.100/Adult/Files/Rating Top 30 JAV pack/IPTD-999-B 彼女の姉貴とイケナイ関係 Rio.wmv
/Volumes/192.168.2.100/Adult/Files/tia/soe935C.HD.wmv
/Volumes/192.168.2.100/Adult/Files/tia/soe935B.HD.wmv
/Volumes/192.168.2.100/Adult/Files/tia/soe935A.HD.wmv
/Volumes/192.168.2.100/Adult/Files/tia/soe935D.HD.wmv
/Volumes/Adult/Files/大桥步兵合集/LAFBD-41.LaForet.Girl.41.angel.and.devil.Miku.Ohashi.2015.Bluray.1080p.x264.ac3-MTeam.mkv
/Volumes/Adult/Files/[Tokyo-Hot] [n1180] 美人秘書3穴串刺奉仕残業 (中井綾香 Ayaka Nakai)/(Tokyo-Hot)(n1180)美人秘書3穴串刺奉仕残業 中井綾香.mp4
/mcdv47.avi
/mcdv-47.avi
/mcdv-047.mp4
/mcdv047.mp4
/mcdv0047.mp4
/1pondo-070409_621.mp4
/Volumes/Adult/Files/Kirara Asuka (@明日花キララ) FHD Pack Vol#1(181222)@RUNBKK/No-Watermarked/HOBD00015.FHD2.wmv
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 1/720p/RBD-406_1.mp4
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 1/720p/MDYD-664B.mp4
/Volumes/Adult/Files/107NTTR-037A.mp4
/Volumes/Adult/Files/Yua.Mikami-PML/SNIS-986 国民的アイドル アドレナリン大爆発禁欲1ヶ月後の性欲剥き出し焦らされトランスFUCK 三上悠亜【桃花族】.mp4
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 2/FHD/UPSM-109_2.mkv
/Volumes/Adult/Files/Kirara Asuka (@明日花キララ) FHD Pack Vol#2(181231)@RUNBKK/No-Watermarked/PPT003.SD3.wmv
/Volumes/Adult/Files/波多野结衣/THE波多野結衣 ぶっかけ50連発 CD1.wmv
/Volumes/Adult/Files/波多野结衣/欲しがり 後編 波多野結衣.wmv
/Volumes/Adult/Files/波多野结衣/欲しがり 前編 波多野結衣.wmv
/Volumes/Adult/Files/波多野结衣/加勒比 062212-055 夫の目の前で妻が ~元上司に縛られて~波多野結衣~.rmvb
/Volumes/Adult/Files/波多野结衣/022213-271-carib-whole_s.mp4
/Volumes/Adult/Files/SKYHD-001~010/SKYHD-009_H265.mkv
/Volumes/Adult/Files/大桥步兵合集/LAFBD-41.LaForet.Girl.41.angel.and.devil.Miku.Ohashi.2015.Bluray.1080p.x264.ac3-MTeam.mkv
/Volumes/Adult/Files/大桥步兵合集/032015_161-caribpr-high.mp4
/Volumes/Adult/Files/桃谷绘里香(桃谷エリカ) 所有作品集合/118ppt00016hhb2.mkv
/Volumes/Adult/Files/SKYHD-011~020/SKYHD-020_H265.mkv
/Volumes/Adult/Files/RION(りおん).Utsunomiya.Shion.宇都宮しをん(うつのみやしをん)/VR/sivr00008_E.mp4
/Volumes/Adult/Files/RION(りおん).Utsunomiya.Shion.宇都宮しをん(うつのみやしをん)/DMM.Video/onsd00899hhb3.mp4
/Volumes/Adult/Files/Rating Top 30 JAV pack/SHKD-744 営業課長の湿ったパンスト 里美ゆりあ.mp4
/Volumes/Adult/Files/Rating Top 30 JAV pack/ABP-627 裏・鈴村あいり-鈴村あいりのオトナの激情SEX4本番 鈴村あいり.MP4
/Volumes/Adult/Files/Rating Top 30 JAV pack/20 ABP-408 上原瑞穂/上原瑞穂 ABP-408 无码流出片段/[ThZu.Cc]20150909164411.m2ts
/Volumes/Adult/Files/Caribbean-101717-520-HD/100917-515/100917-515-carib-1080p.mp4
/Volumes/Adult/Files/ノ瀬アメリ/20081105栗栖エリカ - Sky Angel Blue 10 天舞超絕美少女天使降臨(skyhd010)(中文字幕).avi
/Volumes/Adult/Files/ノ瀬アメリ/一ノ瀬アメリ~加勒比 VERY SEXY.wmv
/Volumes/Adult/Files/ノ瀬アメリ/20101202一瀬アメリ - 東京ブルドック05(inu006).avi
/Volumes/Adult/Files/ノ瀬アメリ/Sky Angel Vol 80 - CD2.mp4
/Volumes/Adult/Files/Mika Sumire すみれ美香/Caribbean-091818-755.mp4
/Volumes/Adult/Files/Takizawa Rola/[HD]abp-031C.wmv
/Volumes/Adult/Files/Takizawa Rola/ABP-013HDA.wmv


@ -1,50 +0,0 @@
/Volumes/Adult/Files/Kirara Asuka (@明日花キララ) FHD Pack Vol#1(181222)@RUNBKK/No-Watermarked/HOBD00015.FHD2.wmv
/1pondo-070409_621.mp4
/Volumes/Adult/Files/107NTTR-037.mp4
/Volumes/Adult/Files/107NTTR-037A.mp4
/Volumes/Adult/Files/Yua.Mikami-PML/TEK-097 ふたりは無敵.wmv
/Volumes/Adult/Files/Yua.Mikami-PML/SNIS-986 国民的アイドル アドレナリン大爆発禁欲1ヶ月後の性欲剥き出し焦らされトランスFUCK 三上悠亜【桃花族】.mp4
/Volumes/Adult/Files/Yua.Mikami-PML/SSNI-030 三上悠亜ファン感謝祭 国民的アイドル×一般ユーザー20人ガチファンとSEX解禁ハメまくりスペシャル【桃花族】.mp4
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 2/FHD/MIDD-893A.mkv
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 2/FHD/UPSM-109_2.mkv
/Volumes/Adult/Files/Kirara Asuka (@明日花キララ) FHD Pack Vol#2(181231)@RUNBKK/No-Watermarked/PPT003.SD3.wmv
/Volumes/Adult/Files/波多野结衣/THE波多野結衣 ぶっかけ50連発 CD1.wmv
/Volumes/Adult/Files/波多野结衣/欲しがり 後編 波多野結衣.wmv
/Volumes/Adult/Files/波多野结衣/欲しがり 前編 波多野結衣.wmv
/Volumes/Adult/Files/波多野结衣/加勒比 062212-055 夫の目の前で妻が ~元上司に縛られて~波多野結衣~.rmvb
/Volumes/Adult/Files/波多野结衣/022213-271-carib-whole_s.mp4
/Volumes/Adult/Files/桜木凛 Rin Sakuragi FHD Collection Pack Vol/BBI-183.wmv
/Volumes/Adult/Files/NOP-019 芭蕾教室 水嶋あずみ/NOP019B.HD.wmv
/Volumes/Adult/Files/一ノ瀬アメリ part2/栗栖エリカ/20081105栗栖エリカ - Sky Angel Blue 10 天舞超絕美少女天使降臨(skyhd010)(中文字幕).avi
/Volumes/Adult/Files/一ノ瀬アメリ part2/Max Girls/Max Girls 24(xv804)伊東遥,Rio,小沢アリス,葉月しおり,一ノ瀬アメリ,ひなた結衣,藤崎りお.avi
/Volumes/Adult/Files/一ノ瀬アメリ part2/瀬アメリAmeri Ichinose/20091127一瀬アメリ - 一見面就做愛(xv801).avi
/Volumes/Adult/Files/Aki Sasaki Megapack/MSTG-003.mkv
/Volumes/Adult/Files/SKYHD-001~010/SKYHD-009_H265.mkv
/Volumes/Adult/Files/大桥步兵合集/LAFBD-41.LaForet.Girl.41.angel.and.devil.Miku.Ohashi.2015.Bluray.1080p.x264.ac3-MTeam.mkv
/Volumes/Adult/Files/大桥步兵合集/032015_161-caribpr-high.mp4
/Volumes/Adult/Files/桃谷绘里香(桃谷エリカ) 所有作品集合/(PRESTIGE)(ABP-171)彼女のお姉さんは、誘惑ヤリたがり娘。桃谷エリカ.wmv
/Volumes/Adult/Files/桃谷绘里香(桃谷エリカ) 所有作品集合/(PRESTIGE)(ABP-145)濃密な接吻と欲情ベロキス性交 04 桃谷エリカ.wmv
/Volumes/Adult/Files/桃谷绘里香(桃谷エリカ) 所有作品集合/118ppt00016hhb2.mkv
/Volumes/Adult/Files/tia/soe935C.HD.wmv
/Volumes/Adult/Files/SKYHD-011~020/SKYHD-020_H265.mkv
/Volumes/Adult/Files/sakumomo1203-PML/IDBD-795 ももに夢中 2018年日本人にもっとも愛された女優桜空ももPREMIUM BOX8時間BEST.mp4
/Volumes/Adult/Files/sakumomo1203-PML/IDBD-768 Gカップグラビアアイドル桜空もも初ベスト 原石 2【桃花族】.mp4
/Volumes/Adult/Files/RION(りおん).Utsunomiya.Shion.宇都宮しをん(うつのみやしをん)/VR/sivr00008_E.mp4
/Volumes/Adult/Files/RION(りおん).Utsunomiya.Shion.宇都宮しをん(うつのみやしをん)/DMM.Video/onsd00899hhb3.mp4
/Volumes/Adult/Files/Rating Top 30 JAV pack/SHKD-744 営業課長の湿ったパンスト 里美ゆりあ.mp4
/Volumes/Adult/Files/Rating Top 30 JAV pack/ABP-627 裏・鈴村あいり-鈴村あいりのオトナの激情SEX4本番 鈴村あいり.MP4
/Volumes/Adult/Files/Rating Top 30 JAV pack/20 ABP-408 上原瑞穂/上原瑞穂 ABP-408 无码流出片段/[ThZu.Cc]20150909164411.m2ts
/Volumes/Adult/Files/Caribbean-101717-520-HD/100917-515/100917-515-carib-1080p.mp4
/Volumes/Adult/Files/Kirara Asuka (@明日花キララ) FHD Pack Vol#3(190119)@RUNBKK/No-Watermarked/SOE976.FHD3.wmv
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 1/720p/RBD-406_1.mp4
/Volumes/Adult/Files/(1.18TB) Julia movie pack collection Part 1/720p/MDYD-664B.mp4
/Volumes/Adult/Files/ノ瀬アメリ/20081105栗栖エリカ - Sky Angel Blue 10 天舞超絕美少女天使降臨(skyhd010)(中文字幕).avi
/Volumes/Adult/Files/ノ瀬アメリ/一ノ瀬アメリ~加勒比 VERY SEXY.wmv
/Volumes/Adult/Files/ノ瀬アメリ/20101202一瀬アメリ - 東京ブルドック05(inu006).avi
/Volumes/Adult/Files/ノ瀬アメリ/Sky Angel Vol 80 - CD2.mp4
/Volumes/Adult/Files/ノ瀬アメリ/20100226一瀬アメリ - OL Style 制服(xv827).avi
/Volumes/Adult/Files/Mika Sumire すみれ美香/Caribbean-091818-755.mp4
/Volumes/Adult/Files/[Tokyo-Hot] [n1180] 美人秘書3穴串刺奉仕残業 (中井綾香 Ayaka Nakai)/(Tokyo-Hot)(n1180)美人秘書3穴串刺奉仕残業 中井綾香.mp4
/Volumes/Adult/Files/Takizawa Rola/[HD]abp-031C.wmv
/Volumes/Adult/Files/Takizawa Rola/ABP-013HDA.wmv
/Volumes/Adult/Files/Uncensored Mosaic Removal Megapack/ADN-017(Asami Ogawa).mp4

228
SiteSource/avsox.py → avsox.py Executable file → Normal file

@ -1,116 +1,112 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *

def getActorPhoto(htmlcode):  # //*[@id="star_qdt"]/li/a/img
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'avatar-box'})
    d = {}
    for i in a:
        l = i.img['src']
        t = i.span.get_text()
        p2 = {t: l}
        d.update(p2)
    return d
def getTitle(a):
    try:
        html = etree.fromstring(a, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']")  # [0]
        return result.replace('/', '')
    except:
        return ''
def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    soup = BeautifulSoup(a, 'lxml')
    a = soup.find_all(attrs={'class': 'avatar-box'})
    d = []
    for i in a:
        d.append(i.span.get_text())
    return d
def getStudio(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '", ' ')
    return result1
def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
    return result1
def getLabel(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
    return result1
def getNum(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
    return result1
def getYear(release):
    try:
        result = str(re.search('\d{4}', release).group())
        return result
    except:
        return release
def getRelease(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
    return result1
def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
    return result
def getCover_small(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
    return result
def getTag(a):  # get the genre tags (the original comment said "actors", which was wrong)
    soup = BeautifulSoup(a, 'lxml')
    a = soup.find_all(attrs={'class': 'genre'})
    d = []
    for i in a:
        d.append(i.get_text())
    return d

def main(number):
    # search avsox.asia for the number; retry with '-' changed to '_' and with
    # the separator removed, since listings are not consistent about the format
    a = get_html('https://avsox.asia/cn/search/' + number)
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    if result1 == '' or result1 == 'null' or result1 == 'None':
        a = get_html('https://avsox.asia/cn/search/' + number.replace('-', '_'))
        print(a)
        html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
        if result1 == '' or result1 == 'null' or result1 == 'None':
            a = get_html('https://avsox.asia/cn/search/' + number.replace('_', ''))
            print(a)
            html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
            result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    web = get_html(result1)
    soup = BeautifulSoup(web, 'lxml')
    info = str(soup.find(attrs={'class': 'row movie'}))
    dic = {
        'actor': getActor(web),
        'title': getTitle(web).strip(getNum(web)),
        'studio': getStudio(info),
        'outline': '',
        'runtime': getRuntime(info),
        'director': '',
        'release': getRelease(info),
        'number': getNum(info),
        'cover': getCover(web),
        'cover_small': getCover_small(a),
        'imagecut': 3,
        'tag': getTag(web),
        'label': getLabel(info),
        'year': getYear(getRelease(info)),  # str(re.search('\d{4}',getRelease(a)).group()),
        'actor_photo': getActorPhoto(web),
        'website': result1,
        'source': 'avsox.py',
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js

#print(main('041516_541'))

36
config.ini Executable file → Normal file

@ -1,35 +1,23 @@
[common]
main_mode=1
failed_output_folder=failed
success_output_folder=JAV_output

[proxy]
proxy=127.0.0.1:1080
timeout=10
retry=3

[Name_Rule]
location_rule=actor+'/'+number
naming_rule=number+'-'+title

[update]
update_check=1

[media]
media_warehouse=emby
#emby or plex or kodi

[directory_capture]
directory=
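The ```Name_Rule``` values above read like small Python expressions over the scraped fields (```actor```, ```number```, ```title```). A minimal sketch of one way such a rule could be applied, assuming an eval-based reading of the config text (the real program may interpret it differently):
```
# Hypothetical: apply a Name_Rule-style expression such as actor+'/'+number.
def apply_rule(rule, fields):
    # Expose only the scraped fields to the expression, no builtins.
    return eval(rule, {"__builtins__": {}}, dict(fields))

fields = {"actor": "actor_name", "number": "ABC-123", "title": "some_title"}
print(apply_rule("actor+'/'+number", fields))  # -> actor_name/ABC-123
print(apply_rule("number+'-'+title", fields))  # -> ABC-123-some_title
```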

1502
core.py

File diff suppressed because it is too large

75
fc2fans_club.py Executable file

@ -0,0 +1,75 @@
import re
from lxml import etree#need install
import json
import ADC_function
def getTitle(htmlcode): # get the title (the original comment said "studio", which was wrong)
html = etree.fromstring(htmlcode,etree.HTMLParser())
result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/h2/text()')).strip(" ['']")
return result
def getActor(htmlcode):
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[5]/a/text()')).strip(" ['']")
return result
except:
return ''
def getStudio(htmlcode): # get the studio
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[5]/a/text()')).strip(" ['']")
return result
except:
return ''
def getNum(htmlcode): # get the ID number
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
return result
def getRelease(htmlcode2): #
html=etree.fromstring(htmlcode2,etree.HTMLParser())
result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
return result
def getCover(htmlcode2): # get the cover URL
html = etree.fromstring(htmlcode2, etree.HTMLParser())
result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[1]/a/img/@src')).strip(" ['']")
return 'http:' + result
def getOutline(htmlcode2): # get the plot outline
html = etree.fromstring(htmlcode2, etree.HTMLParser())
result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[4]/p/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',')
return result
def getTag(htmlcode): # get the genre tags
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = html.xpath('//*[@id="container"]/div[1]/div/article/section[6]/ul/li/a/text()')
return result
def getYear(release):
try:
result = re.search('\d{4}',release).group()
return result
except:
return ''
def main(number):
number=number.replace('PPV','').replace('ppv','').strip('fc2_').strip('fc2-').strip('ppv-').strip('PPV-').strip('FC2_').strip('FC2-').strip('ppv-').strip('PPV-').replace('fc2ppv-','').replace('FC2PPV-','')
htmlcode2 = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id='+str(number).lstrip("FC2-").lstrip("fc2-").lstrip("fc2_").lstrip("fc2-")+'')
#htmlcode = ADC_function.get_html('http://fc2fans.club/html/FC2-' + number + '.html')
dic = {
'title': getTitle(htmlcode2),
'studio': getStudio(htmlcode2),
'year': getYear(getRelease(htmlcode2)),
'outline': getOutline(htmlcode2),
'runtime': getYear(getRelease(htmlcode2)),
'director': getStudio(htmlcode2),
'actor': getStudio(htmlcode2),
'release': getRelease(htmlcode2),
'number': 'FC2-'+number,
'cover': getCover(htmlcode2),
'imagecut': 0,
'tag': getTag(htmlcode2),
'actor_photo':'',
'website': 'http://adult.contents.fc2.com/article_search.php?id=' + number,
'source': 'fc2fans_club.py',
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)#.encode('UTF-8')
return js
#print(main('1145465'))

javbus.py

@ -1,139 +1,137 @@
import re
from pyquery import PyQuery as pq  # need install
from lxml import etree  # need install
from bs4 import BeautifulSoup  # need install
import json
from ADC_function import *

def getActorPhoto(htmlcode):  # //*[@id="star_qdt"]/li/a/img
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'star-name'})
    d = {}
    for i in a:
        l = i.a['href']
        t = i.get_text()
        html = etree.fromstring(get_html(l), etree.HTMLParser())
        p = str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")
        p2 = {t: p}
        d.update(p2)
    return d
def getTitle(htmlcode):  # get the title
    doc = pq(htmlcode)
    title = str(doc('div.container h3').text()).replace(' ', '-')
    try:
        title2 = re.sub('n\d+-', '', title)
        return title2
    except:
        return title
def getStudio(htmlcode):  # get the studio
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
    return result
def getYear(htmlcode):  # get the year
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
    return result
def getCover(htmlcode):  # get the cover URL
    doc = pq(htmlcode)
    image = doc('a.bigImage')
    return image.attr('href')
def getRelease(htmlcode):  # get the release date
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
    return result
def getRuntime(htmlcode):  # get the runtime in minutes
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find(text=re.compile('分鐘'))
    return a
def getActor(htmlcode):  # get the actresses
    b = []
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'star-name'})
    for i in a:
        b.append(i.get_text())
    return b
def getNum(htmlcode):  # get the ID number
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    return result
def getDirector(htmlcode):  # get the director
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
    return result
def getOutline(htmlcode):  # get the plot outline (the original comment said "actors", which was wrong)
    doc = pq(htmlcode)
    result = str(doc('tr td div.mg-b20.lh4 p.mg-b20').text())
    return result
def getSerise(htmlcode):  # get the series/label
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
    return result
def getTag(htmlcode):  # get the genre tags (the original comment said "actors", which was wrong)
    tag = []
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'genre'})
    for i in a:
        if 'onmouseout' in str(i):
            continue
        tag.append(i.get_text())
    return tag
def main(number):
    try:
        htmlcode = get_html('https://www.javbus.com/' + number)
        try:
            dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", ''))
        except:
            dww_htmlcode = ''
        dic = {
            'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
            'studio': getStudio(htmlcode),
            'year': str(re.search('\d{4}', getYear(htmlcode)).group()),
            'outline': getOutline(dww_htmlcode),
            'runtime': getRuntime(htmlcode),
            'director': getDirector(htmlcode),
            'actor': getActor(htmlcode),
            'release': getRelease(htmlcode),
            'number': getNum(htmlcode),
            'cover': getCover(htmlcode),
            'imagecut': 1,
            'tag': getTag(htmlcode),
            'label': getSerise(htmlcode),
            'actor_photo': getActorPhoto(htmlcode),
            'website': 'https://www.javbus.com/' + number,
            'source': 'javbus.py',
        }
        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
        return js
    except:
        # censored lookup failed; fall back to the uncensored page layout
        return main_uncensored(number)
def main_uncensored(number):
    htmlcode = get_html('https://www.javbus.com/' + number)
    dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", ''))
    if getTitle(htmlcode) == '':
        htmlcode = get_html('https://www.javbus.com/' + number.replace('-', '_'))
        dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", ''))
    dic = {
        'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))).replace(getNum(htmlcode) + '-', ''),
        'studio': getStudio(htmlcode),
        'year': getYear(htmlcode),
        'outline': getOutline(dww_htmlcode),
        'runtime': getRuntime(htmlcode),
        'director': getDirector(htmlcode),
        'actor': getActor(htmlcode),
        'release': getRelease(htmlcode),
        'number': getNum(htmlcode),
        'cover': getCover(htmlcode),
        'tag': getTag(htmlcode),
        'label': getSerise(htmlcode),
        'imagecut': 0,
        'actor_photo': '',
        'website': 'https://www.javbus.com/' + number,
        'source': 'javbus.py',
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js

139
javdb.py Executable file

@ -0,0 +1,139 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
def getTitle(a):
try:
html = etree.fromstring(a, etree.HTMLParser())
result = str(html.xpath('/html/body/section/div/h2/strong/text()')).strip(" ['']")
return re.sub('.*\] ','',result.replace('/', ',').replace('\\xa0','').replace(' : ',''))
except:
# `result` may be unbound here if parsing failed above; return an empty title instead
return ''
def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace(",\\xa0","").replace("'","").replace(' ','').replace(',,','').lstrip(',').replace(',',', ')
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"製作")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"製作")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1+result2).strip('+').replace("', '",'').replace('"','')
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getNum(a):
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+')
def getYear(getRelease):
try:
result = str(re.search('\d{4}',getRelease).group())
return result
except:
return getRelease
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+')
def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace(",\\xa0","").replace("'","").replace(' ','').replace(',,','').lstrip(',')
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/section/div/div[2]/div[1]/a/img/@src')).strip(" ['']")
if result == '':
result = str(html.xpath('/html/body/section/div/div[3]/div[1]/a/img/@src')).strip(" ['']")
return result
def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']")
return result
def main(number):
try:
a = get_html('https://javdb.com/search?q=' + number + '&f=all')
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//*[@id="videos"]/div/div/a/@href')).strip(" ['']")
if result1 == '':
a = get_html('https://javdb.com/search?q=' + number.replace('-', '_') + '&f=all')
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//*[@id="videos"]/div/div/a/@href')).strip(" ['']")
b = get_html('https://javdb1.com' + result1)
soup = BeautifulSoup(b, 'lxml')
a = str(soup.find(attrs={'class': 'panel'}))
dic = {
'actor': getActor(a),
'title': getTitle(b).replace("\\n", '').replace(' ', '').replace(getActor(a), '').replace(getNum(a), '').replace('无码', '').replace('有码', '').lstrip(' '),
'studio': getStudio(a),
'outline': getOutline(a),
'runtime': getRuntime(a),
'director': getDirector(a),
'release': getRelease(a),
'number': getNum(a),
'cover': getCover(b),
'imagecut': 0,
'tag': getTag(a),
'label': getLabel(a),
'year': getYear(getRelease(a)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://javdb1.com' + result1,
'source': 'javdb.py',
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
except:
a = get_html('https://javdb.com/search?q=' + number + '&f=all')
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//*[@id="videos"]/div/div/a/@href')).strip(" ['']")
if result1 == '' or result1 == 'null':
a = get_html('https://javdb.com/search?q=' + number.replace('-', '_') + '&f=all')
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//*[@id="videos"]/div/div/a/@href')).strip(" ['']")
b = get_html('https://javdb.com' + result1)
soup = BeautifulSoup(b, 'lxml')
a = str(soup.find(attrs={'class': 'panel'}))
dic = {
'actor': getActor(a),
'title': getTitle(b).replace("\\n", '').replace(' ', '').replace(getActor(a), '').replace(getNum(a), '').replace('无码', '').replace('有码', '').lstrip(' '),
'studio': getStudio(a),
'outline': getOutline(a),
'runtime': getRuntime(a),
'director': getDirector(a),
'release': getRelease(a),
'number': getNum(a),
'cover': getCover(b),
'imagecut': 0,
'tag': getTag(a),
'label': getLabel(a),
'year': getYear(getRelease(a)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://javdb.com' + result1,
'source': 'javdb.py',
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
#print(main('061519-861'))

0
readme/This is readms.md's images folder Executable file → Normal file
0
readme/flow_chart2.png Executable file → Normal file (101 KiB, unchanged)
0
readme/readme1.PNG Executable file → Normal file (1.1 KiB, unchanged)
0
readme/readme2.PNG Executable file → Normal file (3.4 KiB, unchanged)
0
readme/readme3.PNG Executable file → Normal file (1.3 KiB, unchanged)
0
readme/readme4.PNG Executable file → Normal file (16 KiB, unchanged)
BIN
readme/readme5.png Normal file (new, 457 KiB)
0
readme/single.gif Executable file → Normal file (68 KiB, unchanged)
View File

@ -1 +0,0 @@
1

Binary file not shown. (deleted binary, 101 KiB)

Binary file not shown. (deleted binary, 1.1 KiB)

Binary file not shown. (deleted binary, 3.4 KiB)

Binary file not shown. (deleted binary, 1.3 KiB)

Binary file not shown. (deleted binary, 16 KiB)

View File

@ -1 +0,0 @@
pipenv install -rlxml bs4 pillow pyquery

Binary file not shown. (deleted binary, 68 KiB)

View File

@@ -1,108 +1,104 @@
 import re
 from lxml import etree
 import json
 from bs4 import BeautifulSoup
 from ADC_function import *
-# import sys
-# import io
-# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
 
 def getTitle(a):
     try:
         html = etree.fromstring(a, etree.HTMLParser())
-        result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']")
+        result = str(html.xpath('//*[@id="center_column"]/div[2]/h1/text()')).strip(" ['']")
         return result.replace('/', ',')
     except:
         return ''
 def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
     html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
     result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
 def getStudio(a):
     html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
     result1=str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     result2=str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     return str(result1+result2).strip('+').replace("', '",'').replace('"','')
 def getRuntime(a):
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
     result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     return str(result1 + result2).strip('+').rstrip('mi')
 def getLabel(a):
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
     result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
 def getNum(a):
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
     result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     return str(result1 + result2).strip('+')
 def getYear(getRelease):
     try:
         result = str(re.search('\d{4}',getRelease).group())
         return result
     except:
         return getRelease
 def getRelease(a):
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
     result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     return str(result1 + result2).strip('+')
 def getTag(a):
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
     result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','')
 def getCover(htmlcode):
     html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
-    # /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src
+    result = str(html.xpath('//*[@id="center_column"]/div[2]/div[1]/div/div/h2/img/@src')).strip(" ['']")
     return result
 def getDirector(a):
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
     result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
     return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
 def getOutline(htmlcode):
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']")
     return result
 def main(number2):
     number=number2.upper()
-    htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
+    htmlcode=get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'})
     soup = BeautifulSoup(htmlcode, 'lxml')
     a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
     dic = {
         'title': getTitle(htmlcode).replace("\\n",'').replace(' ',''),
         'studio': getStudio(a),
         'outline': getOutline(htmlcode),
         'runtime': getRuntime(a),
         'director': getDirector(a),
         'actor': getActor(a),
         'release': getRelease(a),
         'number': getNum(a),
         'cover': getCover(htmlcode),
         'imagecut': 0,
         'tag': getTag(a),
         'label':getLabel(a),
         'year': getYear(getRelease(a)), # str(re.search('\d{4}',getRelease(a)).group()),
         'actor_photo': '',
         'website':'https://www.mgstage.com/product/product_detail/'+str(number)+'/',
-        'source': 'mgstage.py',
+        'source': 'siro.py',
     }
-    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
+    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)#.encode('UTF-8')
     return js
 
-#print(main('SIRO-3607'))
+#print(main('300maan-373'))
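A note on the cookies={'adc': '1'} argument in main() above: mgstage gates product pages behind an age-confirmation page, and presetting that cookie appears to be how the scraper skips it. A minimal sketch of the same request with plain requests (the product ID is a placeholder taken from the old comment; get_html in ADC_function presumably adds proxy handling on top of this):

import requests

# Presetting the 'adc' cookie skips the age-confirmation interstitial
# (an assumption inferred from the cookies={'adc': '1'} call above).
url = 'https://www.mgstage.com/product/product_detail/SIRO-3607/'
html = requests.get(url, cookies={'adc': '1'}, timeout=10).text
print(html[:200])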

80
test.py
View File

@@ -1,80 +0,0 @@
import os
import re
from itertools import groupby

import fuckit as fuckit
import pandas as pd
from tenacity import retry, stop_after_delay, wait_fixed


def go():
    a = [1, 2, 3, 4, 5, 6]
    # [print(x) for x in a]
    # [print(x) for x in a]
    a1 = groupby(a, key=lambda k: (k / 2))
    for i in a1:
        print(i)
    for i in a1:
        print(i)


class TryDo:
    def __init__(self, func, times=3):
        self.tries = times
        self.func = func

    def __iter__(self):
        self.currentTry = 1
        return self

    def __next__(self):
        if self.currentTry > self.tries:
            raise StopIteration(False)
        else:
            self.currentTry += 1
            self.func()
            raise StopIteration(True)

    # def do(self):


@retry(stop=stop_after_delay(3), wait=wait_fixed(2))
def stop_after_10_s():
    print("Stopping after 10 seconds")
    raise Exception


# f = iter( TryDo(do_something, 5))
# stop_after_10_s()


def errorfunc():
    raise Exception


def okfunc():
    print("ok")


# with fuckit:
#     errorfunc()
#     okfunc()

# re.match()
r = re.search(r'(?<=999)-?((?P<alpha>([A-Z](?![A-Z])))|(?P<num>\d(?!\d)))', "IPTD-999-B-彼女の姉貴とイケナイ関係-RIO", re.I)
#
print(r.groupdict())
print(r.groupdict()['alpha'])
print(r.group(2))

import re

line = "Cats are smarter than dogs"
matchObj = re.search(r'(?<=a)(.*) are (.*?) .*', line, re.M | re.I)
if matchObj:
    print("matchObj.group() : ", matchObj.group())
    print("matchObj.group(1) : ", matchObj.group(1))
    print("matchObj.group(2) : ", matchObj.group(2))
else:
    print("No match!!")

# print(r[-1])
# print(newList)

8
update_check.json Executable file → Normal file
View File

@@ -1,5 +1,5 @@
 {
-"version": "2.8.2",
-"version_show":"2.8.2",
-"download": "https://github.com/yoshiko2/AV_Data_Capture/releases"
+"version": "1.3",
+"version_show":"1.3",
+"download": "https://github.com/wenead99/AV_Data_Capture/releases"
 }
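For context, a minimal sketch of how an update checker might consume this file; the local version string and the comparison logic are assumptions, since the real check lives elsewhere in the project:

import json

with open('update_check.json', encoding='UTF-8') as f:
    info = json.load(f)
local_version = '1.3'  # assumed to come from the running program
if info['version'] != local_version:
    print('New version available:', info['version_show'], '->', info['download'])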