import requests
from bs4 import BeautifulSoup
from faker import Faker
from urllib.request import urlretrieve
import urllib.request
from urllib.parse import quote_plus  # used to URL-encode search queries below
from urllib3.util.retry import Retry
import time
import os
import wget
import json
import unicodedata
import regex as re  # the tokenizer pattern below uses \p{L}/\p{N}, which the stdlib re module does not support
import nltk
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
import warnings
from requests.adapters import HTTPAdapter
from constants import *
MAX_XDD = 5
use_google_search = True  # toggle the live Google scraping in get_google_search_results()
use_20newsgroup = True    # toggle the 20 Newsgroups fetch in fetch_20newsgroup_data()
fake = Faker()            # used to generate random User-Agent strings for outgoing requests
def create_retry_session():
    """Build a requests.Session that retries transient HTTP failures with back-off."""
    retry_strategy = Retry(
        total=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],  # renamed from method_whitelist, which urllib3 2.x removed
        backoff_factor=1,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    http = requests.Session()
    http.mount("https://", adapter)
    http.mount("http://", adapter)
    return http
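# Illustrative usage (the URL here is a placeholder, not part of the original code):
#   session = create_retry_session()
#   resp = session.get("https://example.com", timeout=10)
# 429/5xx responses are retried up to five times with increasing back-off before the
# caller sees an exception.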
def get_google_search_results(query, retry_session):
    """Scrape result URLs from a Google web search for `query`."""
    if not use_google_search:
        return []
    headers = {"User-Agent": fake.user_agent()}  # randomize the User-Agent per request
    search_url = f"https://www.google.com/search?q={quote_plus(query)}"  # encode spaces and special characters
    try:
        response = retry_session.get(search_url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    search_results = []
    for a_tag in soup.find_all('a', href=True):
        # Organic result links look like "/url?q=<target>&sa=..."; skip Google account links.
        if 'url?q=' in a_tag['href'] and not a_tag['href'].startswith("https://accounts.google.com"):
            search_results.append(a_tag['href'].split('url?q=')[1].split('&')[0])
    return search_results
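# Illustrative usage (hypothetical query string):
#   session = create_retry_session()
#   urls = get_google_search_results("20 newsgroups dataset", session)
# Scraping Google's HTML is brittle: the markup and the "/url?q=" redirect format can
# change at any time, so treat an empty list as a normal failure mode.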
def fetch_20newsgroup_data():
    """Return training documents from a subset of 20 Newsgroups categories."""
    if not use_20newsgroup:
        return []
    try:
        # The original list also included 'talk.trivia', which is not a valid 20 Newsgroups
        # category and made fetch_20newsgroups raise (so this function always returned []).
        newsgroups_train = fetch_20newsgroups(
            subset='train',
            categories=['rec.sport.baseball', 'sci.med',
                        'comp.sys.ibm.pc.hardware', 'soc.religion.christian'],
        )
        return newsgroups_train.data
    except Exception:
        return []
def download_file(url, filename, folder, retries=3):
    """Download `url` to `folder/filename`, retrying on failure; returns True on success."""
    filepath = os.path.join(folder, filename)
    if os.path.exists(filepath):
        return True  # already downloaded
    os.makedirs(folder, exist_ok=True)
    for attempt in range(retries):
        try:
            wget.download(url, out=filepath)
            return True
        except Exception:
            if attempt < retries - 1:
                time.sleep(2)  # brief pause before retrying
            else:
                return False
    return False
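# Illustrative usage (hypothetical URL and paths):
#   ok = download_file("https://example.com/pytorch_model.bin", "pytorch_model.bin", "models/gpt2")
# Note that a failed wget.download call can leave a partial file behind; deleting
# `filepath` in the except branch before retrying would be a reasonable extension.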
def download_gpt2_files(folder, model_url, model_file, encoder_url, encoder_file, vocab_url, vocab_file):
if not os.path.exists(folder):
os.makedirs(folder)
if not os.path.exists(os.path.join(folder, model_file)):
download_file(model_url, model_file, folder)
if not os.path.exists(os.path.join(folder, encoder_file)):
download_file(encoder_url, encoder_file, folder)
if not os.path.exists(os.path.join(folder, vocab_file)):
download_file(vocab_url, vocab_file, folder)
def _download_model_files(folder, model_files_urls):
    """Download each (url, filename) pair into `folder`, skipping files that already exist."""
    if not os.path.exists(folder):
        os.makedirs(folder)
    for url, filename in model_files_urls:
        if not os.path.exists(os.path.join(folder, filename)):
            download_file(url, filename, folder)
# The per-modality download helpers below all shared an identical body; they are kept as
# distinct names (bound to the same implementation) so existing callers keep working.
download_translation_files = _download_model_files
download_codegen_files = _download_model_files
download_summarization_files = _download_model_files
download_imagegen_files = _download_model_files
download_image_to_3d_files = _download_model_files
download_text_to_video_files = _download_model_files
download_sentiment_files = _download_model_files
download_stt_files = _download_model_files
download_tts_files = _download_model_files
download_musicgen_files = _download_model_files
def bytes_to_unicode_gpt2():
    """GPT-2's reversible byte-to-unicode mapping used by the BPE tokenizer."""
    # Printable and latin-1 bytes map to themselves; every remaining byte is shifted past
    # 255 so that each of the 256 byte values gets a distinct, visible character.
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n = n+1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
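# For example, the space byte (0x20) is not in the printable ranges above, so it maps to
# the shifted character 'Ġ' (U+0120); this is why GPT-2 vocabulary entries show 'Ġ' for
# tokens that begin with a space. The mapping is a bijection, so it can be inverted to
# recover raw bytes (see byte_decoder in get_codegen_tokenizer_pure below).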
def get_codegen_tokenizer_pure(vocab_file, merges_file):
    """Build a GPT-2-style BPE tokenizer from a vocab.json and a merges.txt file."""
    with open(vocab_file, encoding="utf-8") as f:
        vocab = json.load(f)
    with open(merges_file, encoding="utf-8") as f:
        merges = f.read().split('\n')[1:-1]  # skip the header line and the trailing blank line
    bpe_ranks = dict(zip(merges, range(len(merges))))
    byte_encoder = bytes_to_unicode_gpt2()  # was bytes_to_unicode(), which is not defined here
    byte_decoder = {v: k for k, v in byte_encoder.items()}
    # \p{L}/\p{N} require the third-party `regex` module (imported above as `re`).
    tokenizer_regex = re.compile(r'''<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+''')
    tokenize = lambda text: tokenizer_regex.findall(text)
    # `Encoder` is expected to be provided by the wildcard `constants` import (or another local module).
    encoder_obj = Encoder(
        encoder=vocab,
        decoder={v: u for u, v in vocab.items()},
        bpe_ranks=bpe_ranks,
        byte_encoder=byte_encoder,
        byte_decoder=byte_decoder,
        tokenize=tokenize,
    )
    return encoder_obj
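# The tokenizer above mirrors GPT-2-style BPE pre-tokenization: the regex first splits text
# into contractions, letter runs, digits, punctuation and whitespace, and the byte
# encoder/decoder make the scheme lossless on arbitrary bytes. For instance, the regex
# splits "don't" into ['don', "'t"] before any BPE merges are applied.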