From 575a710ef8921e1e60e95fe044973b50b544468e Mon Sep 17 00:00:00 2001 From: wenead99 <42309414+wenead99@users.noreply.github.com> Date: Sat, 22 Jun 2019 16:16:18 +0800 Subject: [PATCH] =?UTF-8?q?Beta=2010.6=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AV_Data_Capture.py | 4 +- core.py | 27 ++++++------ siro.py | 101 +++++++++++++++++++++++++-------------------- 3 files changed, 72 insertions(+), 60 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 42b8ba1..98de9c6 100644 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -6,7 +6,7 @@ import sys from ADC_function import * import json -version='0.10.5' +version='0.10.6' def UpdateCheck(): html2 = get_html('https://raw.githubusercontent.com/wenead99/AV_Data_Capture/master/update_check.json') @@ -78,4 +78,4 @@ if __name__ =='__main__': print("[!]Cleaning empty folders") CEF('JAV_output') print("[+]All finished!!!") - input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看错误信息。") + input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看错误信息。") \ No newline at end of file diff --git a/core.py b/core.py index 27a0e44..9801d9d 100644 --- a/core.py +++ b/core.py @@ -55,6 +55,7 @@ def getNumberFromFilename(filepath): global cover global imagecut global tag + global image_main global naming_rule global location_rule @@ -122,19 +123,19 @@ def getNumberFromFilename(filepath): - title = json_data['title'] - studio = json_data['studio'] - year = json_data['year'] - outline = json_data['outline'] - runtime = json_data['runtime'] - director = json_data['director'] - actor_list= str(json_data['actor']).strip("[ ]").replace("'",'').replace(" ",'').split(',') #字符串转列表 - release = json_data['release'] - number = json_data['number'] - cover = json_data['cover'] - imagecut = json_data['imagecut'] - tag = str(json_data['tag']).strip("[ ]").replace("'",'').replace(" ",'').split(',') #字符串转列表 - actor = str(actor_list).strip("[ ]").replace("'",'').replace(" ",'') + title = json_data['title'] + studio = json_data['studio'] + year = json_data['year'] + outline = json_data['outline'] + runtime = json_data['runtime'] + director = json_data['director'] + actor_list = str(json_data['actor']).strip("[ ]").replace("'",'').replace(" ",'').split(',') #字符串转列表 + release = json_data['release'] + number = json_data['number'] + cover = json_data['cover'] + imagecut = json_data['imagecut'] + tag = str(json_data['tag']).strip("[ ]").replace("'",'').replace(" ",'').split(',') #字符串转列表 + actor = str(actor_list).strip("[ ]").replace("'",'').replace(" ",'') #====================处理异常字符====================== #\/:*?"<>| #if "\\" in title or "/" in title or ":" in title or "*" in title or "?" in title or '"' in title or '<' in title or ">" in title or "|" in title or len(title) > 200: diff --git a/siro.py b/siro.py index aad875e..3692bdd 100644 --- a/siro.py +++ b/siro.py @@ -8,81 +8,92 @@ from ADC_function import * def getTitle(a): html = etree.fromstring(a, etree.HTMLParser()) result = str(html.xpath('//*[@id="center_column"]/div[2]/h1/text()')).strip(" ['']") - return result + return result.replace('/',',') def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text() - result2=str(html.xpath('//table/tr[1]/td[1]/text()')).strip(" ['\\n ']") - result1 = str(html.xpath('//table/tr[1]/td[1]/a/text()')).strip(" ['\\n ']") - return str(result1+result2).strip('+') + result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') + result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') + return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',') def getStudio(a): - html = etree.fromstring(a, etree.HTMLParser()) - result2=str(html.xpath('//table[2]/tr[2]/td/text()')).strip(" ['\\n ']") - result1 = str(html.xpath('//table/tr[2]/td[1]/a/text()')).strip(" ['\\n ']") - return str(result1+result2).strip('+') + html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text() + result1=str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') + result2=str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') + return str(result1+result2).strip('+').replace("', '",'').replace('"','') def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) - result2=str(html.xpath('//table/tr[3]/td[1]/text()')).strip(" ['\\n ']") - result1 = str(html.xpath('//table/tr[3]/td[1]/a/text()')).strip(" ['\\n ']") - return str(result1 + result2).strip('+').strip('mi') + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') + result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') + return str(result1 + result2).strip('+').rstrip('mi') def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) - result2=str(html.xpath('//table/tr[6]/td[1]/text()')).strip(" ['\\n ']") - result1 = str(html.xpath('//table/tr[6]/td[1]/a/text()')).strip(" ['\\n ']") - return str(result1 + result2).strip('+') + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + return str(result1 + result2).strip('+').replace("', '",'').replace('"','') def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) - result2=str(html.xpath('//table/tr[2]/td[4]/a/text()')).strip(" ['\\n ']") - result1 = str(html.xpath('//table/tr[2]/td[4]/text()')).strip(" ['\\n ']") + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') return str(result1 + result2).strip('+') -def getYear(a): - html = etree.fromstring(a, etree.HTMLParser()) - result2=str(html.xpath('//table/tr[2]/td[5]/a/text()')).strip(" ['\\n ']") - result1=str(html.xpath('//table/tr[2]/td[5]/text()')).strip(" ['\\n ']") - return result2+result1 +def getYear(getRelease): + try: + result = str(re.search('\d{4}',getRelease).group()) + return result + except: + return getRelease def getRelease(a): - html = etree.fromstring(a, etree.HTMLParser()) - result2=str(html.xpath('//table/tr[5]/td[1]/text()')).strip(" ['\\n ']") - result1 = str(html.xpath('//table/tr[5]/a/td[1]/text()')).strip(" ['\\n ']") + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') return str(result1 + result2).strip('+') def getTag(a): - html = etree.fromstring(a, etree.HTMLParser()) - result2=str(html.xpath('//table/tr[8]/td[1]/a/text()')).strip(" ['\\n ']") - result1=str(html.xpath('//table/tr[8]/td[1]/text()')).strip(" ['\\n ']") - return str(result1 + result2).strip('+') + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','') def getCover(htmlcode): html = etree.fromstring(htmlcode, etree.HTMLParser()) result = str(html.xpath('//*[@id="center_column"]/div[2]/div[1]/div/div/h2/img/@src')).strip(" ['']") return result def getDirector(a): - html = etree.fromstring(a, etree.HTMLParser()) - result1 = str(html.xpath('//table/tr[2]/td[1]/text()')).strip(" ['\\n ']") - result2 = str(html.xpath('//table/tr[2]/td[1]/a/text()')).strip(" ['\\n ']") - return str(result1 + result2).strip('+') + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + return str(result1 + result2).strip('+').replace("', '",'').replace('"','') def getOutline(htmlcode): html = etree.fromstring(htmlcode, etree.HTMLParser()) result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']") return result -def main(number): - htmlcode=get_html('https://www.mgstage.com/product/product_detail/'+str(number),cookies={'adc':'1'}) +def main(number2): + number=number2.upper() + htmlcode=get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}) soup = BeautifulSoup(htmlcode, 'lxml') - a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','') - #print(a) + a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','') dic = { 'title': getTitle(htmlcode).replace("\\n",'').replace(' ',''), 'studio': getStudio(a), - 'year': str(re.search('\d{4}',getRelease(a)).group()), 'outline': getOutline(htmlcode), 'runtime': getRuntime(a), 'director': getDirector(a), 'actor': getActor(a), 'release': getRelease(a), - 'number': number, + 'number': getNum(a), 'cover': getCover(htmlcode), 'imagecut': 0, - 'tag': getTag(a).replace("'\\n',",'').replace(' ', '').replace("\\n','\\n",','), - 'label':getLabel(a) + 'tag': getTag(a), + 'label':getLabel(a), + 'year': getYear(getRelease(a)), # str(re.search('\d{4}',getRelease(a)).group()), } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)#.encode('UTF-8') - #print('https://www.mgstage.com/product/product_detail/'+str(number)) return js -#print(main('SIRO-3552')) \ No newline at end of file + +#print(main('200GANA-1624')) \ No newline at end of file